# RangGaraga's picture
# Update app.py
# 314d95d verified
import html
import os
import re
import time

import fitz
import gradio as gr
from transformers import pipeline
# Build the NER pipeline once at import time; every request reuses this object.
# Its results are read elsewhere in this file through the "entity_group",
# "word" and "score" keys of each returned item.
ner = pipeline("ner", model="cahya/NusaBert-ner-v1.3", aggregation_strategy="simple")
# Label code -> Indonesian display name shown in the result table and legend.
# Codes missing from this map fall back to the raw label at the call sites.
LABEL_MAP = {
"CRD": "Kardinal", "DAT": "Tanggal", "EVT": "Peristiwa",
"FAC": "Fasilitas", "GPE": "Entitas Geopolitik", "LAW": "Peraturan / Undang-Undang",
"LOC": "Lokasi", "MON": "Uang", "NOR": "Organisasi Politik",
"ORD": "Ordinal", "ORG": "Organisasi", "PER": "Orang",
"PRC": "Persentase", "PRD": "Produk", "QTY": "Kuantitas",
"REG": "Agama", "TIM": "Waktu", "WOA": "Karya Seni", "LAN": "Bahasa",
}
# Label code -> background colour (hex) for badges and <mark> highlights.
# Call sites use "#e2e8f0" when a code is not listed here.
LABEL_HEX = {
"PER":"#FFBFBF","ORG":"#AEDDFF","NOR":"#8DC7FF","LOC":"#B8FFB8",
"GPE":"#99F2CC","FAC":"#D9FFA5","DAT":"#FFE58C","TIM":"#FFCC66",
"MON":"#CCFFDA","CRD":"#F2CCFF","ORD":"#E0BFFF","PRC":"#FFF2B2",
"QTY":"#C7F2F2","LAW":"#FFBABA","EVT":"#FFD9A5","PRD":"#BFDFFF",
"REG":"#E6DAFF","WOA":"#FFE6DA","LAN":"#CCFFF2",
}
# PDFs with more pages than this are rejected by run_ner_file.
MAX_PDF_PAGES = 5
# Uploaded text is sent to the model in windows of at most this many characters.
MAX_CHUNK_CHARS = 2000
# Number of characters shared by consecutive windows (presumably so an entity
# that straddles a window boundary is still seen whole in one of them).
OVERLAP_CHARS = 150
# Sample inputs as (title, body) pairs. The UI renders one button per pair;
# clicking a button copies the pair's body into the text input.
EXAMPLES = [
("Contoh 1 – RUPS & Dana Cadangan",
"Berdasarkan Rapat Umum Pemegang Saham (RUPS) pada tanggal 24 Juni 2024 yang disahkan "
"oleh notaris Ashoya Ratam, S.H., M.Kn., Risalah No.124/VI/2024, Perusahaan memutuskan "
"antara lain menyisihkan 5% dari laba bersih untuk tahun yang berakhir 31 Desember 2023 "
"atau sebesar Rp5.299.075.507 sebagai dana cadangan jaminan."),
("Contoh 2 – Akta Jual Beli Saham PEFINDO",
"Berdasarkan Akta Notaris Melinda, S.Sos., S.H., M.Kn dengan No. 17 tanggal 21 Januari "
"2025, Perusahaan dan Dana Pensiun Pertamina telah menandatangani Akta Jual Beli saham "
"dan Perusahaan telah melakukan pembayaran penuh untuk pembelian 5.170 lembar saham "
"PEFINDO yang dimiliki Dana Pensiun Pertamina. Dengan demikian total kepemilikan saham "
"Perusahaan pada tanggal 21 Januari 2025 menjadi sebanyak 37.548 lembar saham atau sama "
"dengan 31,92% kepemilikan di PEFINDO."),
("Contoh 3 – Fasilitas Kredit Bank Permata",
"Pada tanggal 12 Desember 2022, PEI, entitas anak, dan PT Bank Permata Tbk "
"menandatangani perjanjian fasilitas money market dengan fasilitas kredit maksimum "
"sebesar Rp50.000.000.000. Pinjaman ini digunakan untuk keperluan stand by facility "
"dengan jangka waktu penarikan antara 3 (tiga) hari sampai dengan 3 (tiga) bulan "
"semenjak tanggal penarikan pinjaman dilakukan."),
("Contoh 4 – Dividen PEFINDO Biro Kredit",
"Berdasarkan Rapat Umum Pemegang Saham Tahunan tanggal 28 Juni 2024, pemegang saham "
"PEFINDO Biro Kredit menyetujui pembagian dividen untuk Perusahaan sebesar Rp6.637.962.683."),
("Contoh 5 – Regulasi Bursa Karbon",
"Peraturan Presiden RI No. 98 Tahun 2021 tentang Penyelenggaraan Nilai Ekonomi Karbon "
"untuk Pencapaian Target Kontribusi yang Ditetapkan Secara Nasional dan Pengendalian "
"Emisi Gas Rumah Kaca dalam Pembangunan Nasional mengatur mengenai mekanisme pencapaian "
"NDC. Undang-undang RI No. 4 Tahun 2023 tentang Pengembangan dan Penguatan Sektor "
"Keuangan menegaskan bahwa tugas pengaturan dan pengawasan bursa karbon dilakukan oleh "
"Otoritas Jasa Keuangan."),
]
# Helpers
def clean_word(word: str) -> str:
    """Return *word* with tokenizer markers removed and outer whitespace trimmed.

    The "▁" marker is turned into a space and every "##" marker is dropped
    before the result is stripped.
    """
    spaced = word.replace("▁", " ")
    unmarked = spaced.replace("##", "")
    return unmarked.strip()
def get_label_id(raw_label: str) -> str:
    """Normalise a model label to its bare upper-case entity code.

    Surrounding whitespace is removed, the label is upper-cased, a single
    leading BIO prefix ("B-", "I-", "B_" or "I_") is dropped, and the part
    after the last "-" is returned. For example "B-PER", "i_per" and
    " per " all yield "PER".

    Only a *leading* prefix is removed. Removing those two-character
    sequences wherever they occur would corrupt codes that merely contain
    them (e.g. "LAB_ID" would become "LAID").

    Args:
        raw_label: Label string as produced by the NER pipeline.

    Returns:
        The upper-case entity code, or "" for an empty label.
    """
    label = raw_label.strip().upper()
    if label[:2] in ("B-", "I-", "B_", "I_"):
        label = label[2:]
    return label.split("-")[-1].strip()
def highlight_html(text: str, entity_map: dict) -> str:
    """Render *text* as an HTML block with known entities wrapped in ``<mark>``.

    Each key of *entity_map* is searched in *text* as a literal,
    case-insensitive substring (no word-boundary check). Keys are tried
    longest first, and a match overlapping an already claimed range is
    skipped, so the longest entity wins where candidates overlap.

    All text taken from the document, as well as the label shown in the
    ``title`` attribute, is HTML-escaped before being emitted, so markup
    characters in an uploaded file are displayed rather than interpreted.

    Args:
        text: Raw document text.
        entity_map: Mapping of entity text to label code. The label code is
            looked up in LABEL_HEX (colour) and LABEL_MAP (display name);
            unknown codes fall back to "#e2e8f0" and the code itself.

    Returns:
        A single ``<div>`` HTML string. Newlines outside highlighted spans
        are rendered as ``<br>``.
    """
    claimed = [False] * len(text)
    spans = []
    longest_first = sorted(
        entity_map.items(), key=lambda item: len(item[0]), reverse=True
    )
    for entity, label_id in longest_first:
        if not entity:
            continue
        for match in re.finditer(re.escape(entity), text, re.IGNORECASE):
            start, end = match.span()
            if any(claimed[start:end]):
                continue
            spans.append((start, end, label_id))
            claimed[start:end] = [True] * (end - start)
    spans.sort(key=lambda span: span[0])

    def render_plain(segment: str) -> str:
        # Escape first so the inserted <br> tags are the only real markup.
        return html.escape(segment).replace("\n", "<br>")

    parts = []
    cursor = 0
    for start, end, label_id in spans:
        if cursor < start:
            parts.append(render_plain(text[cursor:start]))
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        label_idn = html.escape(LABEL_MAP.get(label_id, label_id))
        word = html.escape(text[start:end])
        parts.append(
            f'<mark style="background:{hex_color};border-radius:4px;padding:1px 4px;'
            f'font-weight:600;" title="{label_idn}">{word}</mark>'
        )
        cursor = end
    if cursor < len(text):
        parts.append(render_plain(text[cursor:]))
    return (
        '<div style="font-family:\'DM Sans\',sans-serif;font-size:14px;line-height:2;'
        'color:#1e293b;background:#fff;border-radius:12px;padding:20px 24px;'
        'border:1px solid #e2e8f0;white-space:pre-wrap;">'
        + "".join(parts) + "</div>"
    )
# NER on free text
def run_ner(text: str) -> str:
    """Run the NER pipeline on *text* and return the entities as an HTML table.

    Entities are listed in the order the model returns them. An entity whose
    cleaned text (compared case-insensitively) has already been listed is
    skipped, so each distinct phrase appears once with the label and score
    of its first occurrence.

    Entity text and label names are HTML-escaped because they originate
    from user input and must not be interpreted as markup.

    Args:
        text: Text typed or pasted by the user; may be empty or None.

    Returns:
        An HTML string: the result table, or a short ``<p>`` notice when the
        input is blank or no entity was found.
    """
    if not text or not text.strip():
        return "<p style='color:#94a3b8;font-style:italic;padding:16px;'>Masukkan teks terlebih dahulu.</p>"
    no_entities = "<p style='color:#94a3b8;font-style:italic;padding:16px;'>Tidak ada entitas yang ditemukan.</p>"
    results = ner(text.strip())
    if not results:
        return no_entities
    rows = []
    seen_words = set()
    for ent in results:
        cleaned = clean_word(ent["word"])
        word_key = cleaned.lower()
        if not cleaned or word_key in seen_words:
            continue
        seen_words.add(word_key)
        raw_label = ent["entity_group"]
        label_id = get_label_id(raw_label)
        label_idn = html.escape(LABEL_MAP.get(label_id, raw_label))
        word = html.escape(cleaned)
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        score = f"{ent['score']:.2%}"
        row_num = len(rows) + 1
        # Zebra striping: even-numbered rows get the tinted background.
        row_bg = "#f8faff" if row_num % 2 == 0 else "#ffffff"
        rows.append(f"""
<tr style="background:{row_bg};">
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;text-align:center;color:#64748b;font-size:12px;">{row_num}</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-weight:600;color:#1e293b;">{word}</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;">
<span style="background:{hex_color};padding:3px 10px;border-radius:20px;font-size:12px;font-weight:600;color:#1e293b;">{label_idn}</span>
</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-size:12px;color:#64748b;text-align:center;">{score}</td>
</tr>""")
    if not rows:
        return no_entities
    rows_html = "".join(rows)
    return f"""
<div style="overflow-x:auto;margin-top:4px;border-radius:12px;border:1px solid #e2e8f0;box-shadow:0 2px 12px rgba(0,0,0,0.06);">
<table style="width:100%;border-collapse:collapse;font-size:14px;font-family:'Segoe UI',sans-serif;">
<thead>
<tr style="background:linear-gradient(135deg,#1e40af,#6d28d9);">
<th style="padding:12px 14px;color:#fff;width:55px;font-weight:600;font-size:12px;letter-spacing:0.05em;">NO</th>
<th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">KATA / FRASA</th>
<th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">ENTITAS</th>
<th style="padding:12px 14px;color:#fff;width:90px;font-weight:600;font-size:12px;letter-spacing:0.05em;">SKOR</th>
</tr>
</thead>
<tbody>{rows_html}</tbody>
</table>
</div>"""
def _split_into_chunks(text: str, chunk_size: int, overlap: int) -> list:
    """Split *text* into windows of at most *chunk_size* chars sharing *overlap* chars."""
    # Always advance by at least one character so a misconfigured overlap
    # (>= chunk_size) cannot stall the loop.
    step = max(chunk_size - overlap, 1)
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start += step
    return chunks


def _read_upload_text(file_path: str):
    """Return ``(text, None)`` for a readable upload or ``(None, error_html)`` otherwise."""
    ext = os.path.splitext(file_path)[-1].lower()
    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read(), None
    if ext == ".pdf":
        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            if page_count > MAX_PDF_PAGES:
                return None, (
                    f"<p style='color:#be123c;padding:16px;'>PDF terlalu banyak halaman "
                    f"({page_count}). Maks {MAX_PDF_PAGES} halaman.</p>"
                )
            return "\n\n".join(page.get_text() for page in doc), None
        finally:
            # Release the document even if text extraction raises.
            doc.close()
    return None, "<p style='color:#be123c;padding:16px;'>Format tidak didukung.</p>"


def run_ner_file(upload_file):
    """Run NER on an uploaded .txt or .pdf file and highlight the entities found.

    The file text is split into overlapping windows (MAX_CHUNK_CHARS /
    OVERLAP_CHARS), each non-blank window is passed to the model, and every
    distinct entity text of at least two characters is mapped to the label of
    its first occurrence. The whole document is then rendered by
    highlight_html together with a legend of the labels that occurred.

    Args:
        upload_file: Path string of the uploaded file, an object exposing the
            path as ``.name``, or None when nothing was uploaded.

    Returns:
        A ``(highlighted_html, legend_html)`` tuple. For a missing file, an
        unsupported extension, a PDF exceeding MAX_PDF_PAGES, or a document
        without entities, the first element is a ``<p>`` notice and the
        second is "".
    """
    if upload_file is None:
        return "<p style='color:#94a3b8;font-style:italic;padding:16px;'>Unggah file terlebih dahulu.</p>", ""
    file_path = upload_file if isinstance(upload_file, str) else upload_file.name
    full_text, error_html = _read_upload_text(file_path)
    if error_html is not None:
        return error_html, ""

    no_entities = "<p style='color:#94a3b8;padding:16px;'>Tidak ada entitas ditemukan.</p>"

    # NER over each window; blank windows carry nothing to tag.
    all_ner_results = []
    for chunk in _split_into_chunks(full_text, MAX_CHUNK_CHARS, OVERLAP_CHARS):
        chunk = chunk.strip()
        if chunk:
            all_ner_results.extend(ner(chunk))
    if not all_ner_results:
        return no_entities, ""

    # Build entity_map: lower-cased entity text -> label of its first occurrence.
    entity_map = {}
    for ent in all_ner_results:
        word = clean_word(ent["word"])
        if len(word) < 2:
            continue
        w_lower = word.lower()
        if w_lower not in entity_map:
            entity_map[w_lower] = get_label_id(ent["entity_group"])
    if not entity_map:
        return no_entities, ""

    highlighted = highlight_html(full_text, entity_map)

    # Badge legend: one badge per label that actually occurred, sorted by code.
    badge_parts = []
    for label in sorted(set(entity_map.values())):
        hex_color = LABEL_HEX.get(label, "#e2e8f0")
        label_idn = html.escape(LABEL_MAP.get(label, label))
        badge_parts.append(
            f'<span style="background:{hex_color};border-radius:20px;'
            f'padding:3px 10px;font-size:12px;font-weight:600;color:#1e293b;margin:3px;">'
            f'{label_idn}</span>'
        )
    badges = "".join(badge_parts)
    legend_html = f'<div style="margin-bottom:12px;line-height:2;">{badges}</div>'
    return highlighted, legend_html
# Stylesheet handed to demo.launch(). It pulls the "Playfair Display" and
# "DM Sans" fonts from Google Fonts and styles the elements that the UI section
# tags via elem_id / elem_classes (e.g. #center-col, #analyze-btn,
# #analyze-pdf-btn, .example-btn, .tab-nav) plus the ids used in the raw HTML
# snippets (#hero-header, #footer, .section-heading).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@700;900&family=DM+Sans:wght@400;500;600&display=swap');
.gradio-container{max-width:100%!important;padding:0!important;background:linear-gradient(160deg,#0f172a 0%,#1e1b4b 40%,#0f172a 100%)!important;min-height:100vh;}
#hero-header{background:linear-gradient(135deg,#1e40af 0%,#6d28d9 50%,#be185d 100%);padding:40px 48px 36px;margin:0 0 24px;border-radius:16px;text-align:center;position:relative;overflow:hidden;}
#hero-header::before{content:'';position:absolute;inset:0;background:url("data:image/svg+xml,%3Csvg width='60' height='60' viewBox='0 0 60 60' xmlns='http://www.w3.org/2000/svg'%3E%3Cg fill='none' fill-rule='evenodd'%3E%3Cg fill='%23ffffff' fill-opacity='0.04'%3E%3Cpath d='M36 34v-4h-2v4h-4v2h4v4h2v-4h4v-2h-4zm0-30V0h-2v4h-4v2h4v4h2V6h4V4h-4zM6 34v-4H4v4H0v2h4v4h2v-4h4v-2H6zM6 4V0H4v4H0v2h4v4h2V6h4V4H6z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");}
.tab-nav{background:rgba(255,255,255,0.05)!important;border-radius:12px!important;padding:4px!important;border:1px solid rgba(255,255,255,0.1)!important;margin:0!important;}
.tab-nav button{background:transparent!important;color:#94a3b8!important;border-radius:8px!important;padding:10px 24px!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:14px!important;transition:all 0.2s!important;}
.tab-nav button.selected{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:#ffffff!important;box-shadow:0 4px 12px rgba(109,40,217,0.4)!important;}
label span{color:#cbd5e1!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:12px!important;letter-spacing:0.07em!important;text-transform:uppercase!important;}
textarea,.gr-textbox textarea{background:rgba(15,23,42,0.7)!important;border:1.5px solid rgba(255,255,255,0.12)!important;border-radius:10px!important;color:#e2e8f0!important;font-family:'DM Sans',sans-serif!important;font-size:14px!important;padding:14px!important;transition:border 0.2s!important;}
textarea:focus{border-color:#6d28d9!important;outline:none!important;}
.example-btn{display:block!important;width:100%!important;text-align:left!important;padding:12px 16px!important;margin-bottom:8px!important;background:rgba(30,64,175,0.15)!important;border:1px solid rgba(99,102,241,0.30)!important;border-radius:10px!important;cursor:pointer!important;font-size:13px!important;line-height:1.6!important;color:#cbd5e1!important;white-space:normal!important;height:auto!important;font-family:'DM Sans',sans-serif!important;transition:all 0.2s!important;}
.example-btn:hover{background:rgba(109,40,217,0.25)!important;border-color:#6d28d9!important;color:#e2e8f0!important;transform:translateX(3px);}
#analyze-btn,#analyze-pdf-btn{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:white!important;font-weight:700!important;font-size:15px!important;border-radius:10px!important;padding:12px 0!important;margin-top:8px!important;font-family:'DM Sans',sans-serif!important;letter-spacing:0.03em!important;box-shadow:0 4px 16px rgba(109,40,217,0.35)!important;transition:all 0.2s!important;border:none!important;}
#analyze-btn:hover,#analyze-pdf-btn:hover{transform:translateY(-2px)!important;box-shadow:0 6px 24px rgba(109,40,217,0.50)!important;}
.section-heading{font-family:'DM Sans',sans-serif;font-weight:700;font-size:11px;letter-spacing:0.12em;text-transform:uppercase;color:#ffffff!important;margin-bottom:12px;display:flex;align-items:center;gap:8px;}
.section-heading::before{content:'';display:inline-block;width:18px;height:2px;background:linear-gradient(90deg,#6d28d9,#be185d);border-radius:2px;}
.gr-file{background:rgba(15,23,42,0.7)!important;border:1.5px dashed rgba(99,102,241,0.40)!important;border-radius:10px!important;color:#94a3b8!important;}
.gr-file-download{background:rgba(30,64,175,0.2)!important;border:1px solid rgba(99,102,241,0.4)!important;border-radius:10px!important;color:#a5b4fc!important;font-family:'DM Sans',sans-serif!important;}
#center-col{max-width:780px!important;margin:0 auto!important;width:100%!important;padding:0 8px!important;}
#footer{text-align:center;padding:20px;color:rgba(148,163,184,0.5);font-family:'DM Sans',sans-serif;font-size:12px;letter-spacing:0.04em;}
.gradio-container h3{color:#e2e8f0!important;}
#center-col p{color:#94a3b8!important;}
"""
# Static banner markup rendered first in the page through gr.HTML. The outer
# element carries id="hero-header", which CUSTOM_CSS styles; the banner shows
# the project title, a subtitle and the name of the model in use.
HERO_HTML = """
<div id="hero-header">
<p style="font-family:'DM Sans',sans-serif;font-size:12px;font-weight:700;letter-spacing:0.20em;text-transform:uppercase;color:rgba(165,180,252,0.8);margin:0 0 10px;">Tugas Kelompok · NLP &amp; Text Mining</p>
<h1 style="font-family:'Playfair Display',serif;font-size:clamp(32px,5vw,56px);font-weight:900;color:#ffffff;margin:0 0 8px;line-height:1.1;text-shadow:0 2px 20px rgba(109,40,217,0.5);">
NER <span style="color:#a78bfa;">for</span> Financial Statements
</h1>
<p style="font-family:'DM Sans',sans-serif;font-size:clamp(13px,2vw,16px);color:rgba(203,213,225,0.85);margin:0 auto 18px;max-width:640px;line-height:1.6;">
Implementasi Named Entity Recognition pada Kumpulan<br>Laporan-laporan Keuangan Bahasa Indonesia
</p>
<div style="display:inline-flex;align-items:center;gap:8px;background:rgba(0,0,0,0.25);border:1px solid rgba(255,255,255,0.15);border-radius:20px;padding:6px 16px 6px 8px;">
<span style="background:linear-gradient(135deg,#1e40af,#6d28d9);border-radius:12px;padding:3px 10px;font-size:11px;font-weight:700;color:#fff;letter-spacing:0.05em;">MODEL</span>
<span style="font-family:monospace;font-size:13px;color:#a5b4fc;">cahya/NusaBert-ner-v1.3</span>
</div>
</div>
"""
# Gradio UI: a banner, two tabs (free-text analysis and file analysis) and a
# footer, followed by the event wiring and the script entry point.
# NOTE(review): leading indentation was not visible in the reviewed copy of
# this file; the nesting below is inferred from the `with` statements and from
# the fact that the event handlers reference components created inside them —
# confirm against the deployed app.py.
with gr.Blocks(title="NER for Financial Statements") as demo:
    gr.HTML(HERO_HTML)
    with gr.Tabs(elem_classes="tab-nav"):
        # Tab 1: paste or pick text, get a table of entities.
        with gr.Tab("Analisis Teks"):
            with gr.Column(elem_id="center-col"):
                gr.HTML('<div class="section-heading">Contoh Teks</div>')
                # One button per example. The label shows title and body; the
                # body is kept next to the button for the click handler below.
                example_btns = []
                for title, body in EXAMPLES:
                    btn = gr.Button(f"📌 {title}\n\n{body}", elem_classes="example-btn")
                    example_btns.append((btn, body))
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Input Teks</div>')
                text_input = gr.Textbox(lines=9, placeholder="Ketik atau tempel teks laporan keuangan di sini…", label="", show_label=False)
                analyze_btn = gr.Button("Lakukan Analisis", elem_id="analyze-btn")
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Hasil Analisis Entitas</div>')
                # Starts with a hint; replaced by run_ner's HTML after analysis.
                text_output = gr.HTML(value="<p style='color:#64748b;font-size:14px;font-family:DM Sans,sans-serif;padding:20px;text-align:center;'>Masukkan teks lalu klik Lakukan Analisis.</p>")
        # Tab 2: upload a .pdf or .txt file, get a legend and highlighted text.
        with gr.Tab("Analisis File"):
            with gr.Column(elem_id="center-col"):
                gr.HTML('<p style="color:#94a3b8;font-family:DM Sans,sans-serif;font-size:13px;margin:0 0 16px;">'
                    'Unggah file .pdf (maks 5 halaman) atau .txt.</p>'
                )
                gr.HTML('<div class="section-heading">Unggah File</div>')
                # type="filepath": the handler receives the upload as a path string.
                pdf_input = gr.File(label="", file_types=[".pdf", ".txt"], type="filepath")
                analyze_pdf_btn = gr.Button("Analisis & Highlight Entitas", elem_id="analyze-pdf-btn")
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Entitas Ditemukan</div>')
                pdf_legend = gr.HTML(value="")
                gr.HTML('<div class="section-heading" style="margin-top:12px;">Teks Ter-highlight</div>')
                pdf_output = gr.HTML(value="")
    gr.HTML('<div id="footer">NER for Financial Statements 2026</div>')
    # Wiring
    for btn, body in example_btns:
        # b=body freezes the current body as a default argument, so each
        # button fills the text box with its own example.
        btn.click(fn=lambda b=body: b, inputs=[], outputs=text_input)
    # Both the button and submitting the text box run the same analysis.
    analyze_btn.click(fn=run_ner, inputs=text_input, outputs=text_output)
    text_input.submit(fn=run_ner, inputs=text_input, outputs=text_output)
    # run_ner_file returns (highlighted_html, legend_html), matching the order
    # of the outputs list.
    analyze_pdf_btn.click(
        fn=run_ner_file,
        inputs=pdf_input,
        outputs=[pdf_output, pdf_legend],
    )
# Start the app with the custom stylesheet only when run as a script.
if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)