Spaces:
Running
Running
| import os | |
| import re | |
| import time | |
| import fitz | |
| import gradio as gr | |
| from transformers import pipeline | |
# Load the token-classification model once at import time; the handlers in
# this file call `ner(...)` directly.
ner = pipeline(
    task="ner",
    model="cahya/NusaBert-ner-v1.3",
    aggregation_strategy="simple",
)
# Entity tag code -> Indonesian display name shown in the UI.
LABEL_MAP = {
    "CRD": "Kardinal",
    "DAT": "Tanggal",
    "EVT": "Peristiwa",
    "FAC": "Fasilitas",
    "GPE": "Entitas Geopolitik",
    "LAW": "Peraturan / Undang-Undang",
    "LOC": "Lokasi",
    "MON": "Uang",
    "NOR": "Organisasi Politik",
    "ORD": "Ordinal",
    "ORG": "Organisasi",
    "PER": "Orang",
    "PRC": "Persentase",
    "PRD": "Produk",
    "QTY": "Kuantitas",
    "REG": "Agama",
    "TIM": "Waktu",
    "WOA": "Karya Seni",
    "LAN": "Bahasa",
}
# Entity tag code -> background colour (hex) used for badges and highlights.
LABEL_HEX = {
    "PER": "#FFBFBF",
    "ORG": "#AEDDFF",
    "NOR": "#8DC7FF",
    "LOC": "#B8FFB8",
    "GPE": "#99F2CC",
    "FAC": "#D9FFA5",
    "DAT": "#FFE58C",
    "TIM": "#FFCC66",
    "MON": "#CCFFDA",
    "CRD": "#F2CCFF",
    "ORD": "#E0BFFF",
    "PRC": "#FFF2B2",
    "QTY": "#C7F2F2",
    "LAW": "#FFBABA",
    "EVT": "#FFD9A5",
    "PRD": "#BFDFFF",
    "REG": "#E6DAFF",
    "WOA": "#FFE6DA",
    "LAN": "#CCFFF2",
}
# Uploaded PDFs with more pages than this are rejected by run_ner_file.
MAX_PDF_PAGES = 5
# Upper bound on the number of characters in one chunk sent to the model.
MAX_CHUNK_CHARS = 2000
# Number of characters each chunk re-reads from the end of the previous one.
OVERLAP_CHARS = 150
# Sample inputs for the text tab as (button title, body text) pairs; the UI
# renders one button per pair and copies the body into the text box on click.
EXAMPLES = [
    ("Contoh 1 – RUPS & Dana Cadangan",
     "Berdasarkan Rapat Umum Pemegang Saham (RUPS) pada tanggal 24 Juni 2024 yang disahkan "
     "oleh notaris Ashoya Ratam, S.H., M.Kn., Risalah No.124/VI/2024, Perusahaan memutuskan "
     "antara lain menyisihkan 5% dari laba bersih untuk tahun yang berakhir 31 Desember 2023 "
     "atau sebesar Rp5.299.075.507 sebagai dana cadangan jaminan."),
    ("Contoh 2 – Akta Jual Beli Saham PEFINDO",
     "Berdasarkan Akta Notaris Melinda, S.Sos., S.H., M.Kn dengan No. 17 tanggal 21 Januari "
     "2025, Perusahaan dan Dana Pensiun Pertamina telah menandatangani Akta Jual Beli saham "
     "dan Perusahaan telah melakukan pembayaran penuh untuk pembelian 5.170 lembar saham "
     "PEFINDO yang dimiliki Dana Pensiun Pertamina. Dengan demikian total kepemilikan saham "
     "Perusahaan pada tanggal 21 Januari 2025 menjadi sebanyak 37.548 lembar saham atau sama "
     "dengan 31,92% kepemilikan di PEFINDO."),
    ("Contoh 3 – Fasilitas Kredit Bank Permata",
     "Pada tanggal 12 Desember 2022, PEI, entitas anak, dan PT Bank Permata Tbk "
     "menandatangani perjanjian fasilitas money market dengan fasilitas kredit maksimum "
     "sebesar Rp50.000.000.000. Pinjaman ini digunakan untuk keperluan stand by facility "
     "dengan jangka waktu penarikan antara 3 (tiga) hari sampai dengan 3 (tiga) bulan "
     "semenjak tanggal penarikan pinjaman dilakukan."),
    ("Contoh 4 – Dividen PEFINDO Biro Kredit",
     "Berdasarkan Rapat Umum Pemegang Saham Tahunan tanggal 28 Juni 2024, pemegang saham "
     "PEFINDO Biro Kredit menyetujui pembagian dividen untuk Perusahaan sebesar Rp6.637.962.683."),
    ("Contoh 5 – Regulasi Bursa Karbon",
     "Peraturan Presiden RI No. 98 Tahun 2021 tentang Penyelenggaraan Nilai Ekonomi Karbon "
     "untuk Pencapaian Target Kontribusi yang Ditetapkan Secara Nasional dan Pengendalian "
     "Emisi Gas Rumah Kaca dalam Pembangunan Nasional mengatur mengenai mekanisme pencapaian "
     "NDC. Undang-undang RI No. 4 Tahun 2023 tentang Pengembangan dan Penguatan Sektor "
     "Keuangan menegaskan bahwa tugas pengaturan dan pengawasan bursa karbon dilakukan oleh "
     "Otoritas Jasa Keuangan."),
]
| # Helpers | |
def clean_word(word: str) -> str:
    """Tidy a token string from the model before it is displayed or matched.

    Every "▁" is turned into a space, every "##" is removed (both look like
    subword-tokenizer markers), and leading/trailing whitespace is trimmed.
    """
    spaced = word.replace("▁", " ")
    joined = spaced.replace("##", "")
    return joined.strip()
def get_label_id(raw_label: str) -> str:
    """Reduce a raw model tag to its bare, upper-case entity code.

    A single leading BIO marker ("B-", "I-", "B_" or "I_", in any letter
    case) is removed, then the part after the last "-" is returned
    upper-cased and stripped, e.g. "B-PER" -> "PER".

    Args:
        raw_label: Tag string as reported by the model.

    Returns:
        The entity code used as key in LABEL_MAP / LABEL_HEX.
    """
    label = raw_label.strip()
    # Only strip the marker at the very start. A blanket str.replace would
    # also delete "B-"/"I-" sequences inside the tag ("B-SUB-ORG" -> "SUORG").
    if label[:2].upper() in ("B-", "I-", "B_", "I_"):
        label = label[2:]
    return label.split("-")[-1].upper().strip()
def highlight_html(text: str, entity_map: dict) -> str:
    """Render *text* as an HTML block with known entities wrapped in <mark>.

    Matching is a case-insensitive substring search for every key of
    *entity_map*. Longer entities are placed first; a later match that
    overlaps an already highlighted range is skipped.

    All document text is HTML-escaped before it is embedded, so characters
    such as "<" or "&" coming from an uploaded file are shown literally
    instead of being interpreted as markup.

    Args:
        text: The full document text.
        entity_map: Mapping of lower-cased entity string -> entity code.

    Returns:
        A single <div> string; newlines outside highlights become <br>.
    """
    import html  # function-scope import keeps this block self-contained

    def render_plain(segment: str) -> str:
        # Escape first, then insert the <br> tags so they are not escaped.
        return html.escape(segment, quote=False).replace("\n", "<br>")

    claimed = [False] * len(text)  # True for characters inside a highlight
    spans = []
    by_length = sorted(entity_map.items(), key=lambda item: len(item[0]), reverse=True)
    for entity_lower, label_id in by_length:
        if not entity_lower:
            continue
        for match in re.finditer(re.escape(entity_lower), text, re.IGNORECASE):
            start, end = match.span()
            if any(claimed[start:end]):
                continue
            spans.append((start, end, label_id))
            claimed[start:end] = [True] * (end - start)
    spans.sort(key=lambda span: span[0])

    parts = []
    cursor = 0
    for start, end, label_id in spans:
        if cursor < start:
            parts.append(render_plain(text[cursor:start]))
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        tooltip = html.escape(str(LABEL_MAP.get(label_id, label_id)), quote=True)
        word = html.escape(text[start:end], quote=False)
        parts.append(
            f'<mark style="background:{hex_color};border-radius:4px;padding:1px 4px;'
            f'font-weight:600;" title="{tooltip}">{word}</mark>'
        )
        cursor = end
    if cursor < len(text):
        parts.append(render_plain(text[cursor:]))

    return (
        '<div style="font-family:\'DM Sans\',sans-serif;font-size:14px;line-height:2;'
        'color:#1e293b;background:#fff;border-radius:12px;padding:20px 24px;'
        'border:1px solid #e2e8f0;white-space:pre-wrap;">'
        + "".join(parts) + "</div>"
    )
| # NER Teks | |
def run_ner(text: str):
    """Run NER on *text* and return an HTML table of the entities found.

    Each distinct entity string (compared case-insensitively) appears once,
    with its row number, display label and model score. Entity strings and
    labels are HTML-escaped because they originate from user input and
    model output.

    Args:
        text: Raw text from the input box; may be empty or None.

    Returns:
        An HTML string: the result table, or a short notice when the input
        is blank or no entity was found.
    """
    import html  # function-scope import keeps this block self-contained

    def notice(message: str) -> str:
        return f"<p style='color:#94a3b8;font-style:italic;padding:16px;'>{message}</p>"

    if not text or not text.strip():
        return notice("Masukkan teks terlebih dahulu.")

    results = ner(text.strip())
    if not results:
        return notice("Tidak ada entitas yang ditemukan.")

    rows = []
    seen_words = set()
    for ent in results:
        raw_label = ent["entity_group"]
        label_id = get_label_id(raw_label)
        word = clean_word(ent["word"])
        if not word:
            continue
        word_key = word.lower()
        if word_key in seen_words:
            continue
        seen_words.add(word_key)

        row_num = len(rows) + 1
        label_idn = html.escape(str(LABEL_MAP.get(label_id, raw_label)), quote=False)
        safe_word = html.escape(word, quote=False)
        hex_color = LABEL_HEX.get(label_id, "#e2e8f0")
        score = f"{ent['score']:.2%}"
        # Alternate row backgrounds for readability.
        row_bg = "#f8faff" if row_num % 2 == 0 else "#ffffff"
        rows.append(f"""
<tr style="background:{row_bg};">
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;text-align:center;color:#64748b;font-size:12px;">{row_num}</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-weight:600;color:#1e293b;">{safe_word}</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;">
<span style="background:{hex_color};padding:3px 10px;border-radius:20px;font-size:12px;font-weight:600;color:#1e293b;">{label_idn}</span>
</td>
<td style="padding:9px 14px;border-bottom:1px solid #e8edf5;font-size:12px;color:#64748b;text-align:center;">{score}</td>
</tr>""")

    if not rows:
        return notice("Tidak ada entitas yang ditemukan.")

    rows_html = "".join(rows)
    return f"""
<div style="overflow-x:auto;margin-top:4px;border-radius:12px;border:1px solid #e2e8f0;box-shadow:0 2px 12px rgba(0,0,0,0.06);">
<table style="width:100%;border-collapse:collapse;font-size:14px;font-family:'Segoe UI',sans-serif;">
<thead>
<tr style="background:linear-gradient(135deg,#1e40af,#6d28d9);">
<th style="padding:12px 14px;color:#fff;width:55px;font-weight:600;font-size:12px;letter-spacing:0.05em;">NO</th>
<th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">KATA / FRASA</th>
<th style="padding:12px 14px;color:#fff;text-align:left;font-weight:600;font-size:12px;letter-spacing:0.05em;">ENTITAS</th>
<th style="padding:12px 14px;color:#fff;width:90px;font-weight:600;font-size:12px;letter-spacing:0.05em;">SKOR</th>
</tr>
</thead>
<tbody>{rows_html}</tbody>
</table>
</div>"""
def _chunk_text(text: str, size: int, overlap: int) -> list:
    """Split *text* into windows of at most *size* characters that each
    re-read the last *overlap* characters of the previous window."""
    # A stride below 1 would never advance and loop forever.
    stride = max(1, size - overlap)
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + size, len(text))
        chunks.append(text[start:end])
        if end == len(text):
            break
        start += stride
    return chunks


def run_ner_file(upload_file):
    """Extract text from an uploaded .txt or .pdf file and highlight entities.

    The text is processed in overlapping chunks of MAX_CHUNK_CHARS
    characters. The first label seen for each distinct lower-cased entity
    string (two characters or longer) is used for highlighting.

    Args:
        upload_file: A file path string, an object with a ``name`` attribute
            holding the path, or None when nothing was uploaded.

    Returns:
        A ``(highlighted_html, legend_html)`` tuple. For notices and errors
        the message is the first element and the legend is "".
    """
    import html  # function-scope import keeps this block self-contained

    if upload_file is None:
        return "<p style='color:#94a3b8;font-style:italic;padding:16px;'>Unggah file terlebih dahulu.</p>", ""

    file_path = upload_file if isinstance(upload_file, str) else upload_file.name
    ext = os.path.splitext(file_path)[-1].lower()

    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            full_text = f.read()
    elif ext == ".pdf":
        doc = fitz.open(file_path)
        try:
            page_count = len(doc)
            if page_count > MAX_PDF_PAGES:
                return (
                    f"<p style='color:#be123c;padding:16px;'>PDF terlalu banyak halaman "
                    f"({page_count}). Maks {MAX_PDF_PAGES} halaman.</p>", ""
                )
            full_text = "\n\n".join(page.get_text() for page in doc)
        finally:
            # Release the document even when text extraction raises.
            doc.close()
    else:
        return "<p style='color:#be123c;padding:16px;'>Format tidak didukung.</p>", ""

    no_entities = "<p style='color:#94a3b8;padding:16px;'>Tidak ada entitas ditemukan.</p>", ""

    all_ner_results = []
    for chunk in _chunk_text(full_text, MAX_CHUNK_CHARS, OVERLAP_CHARS):
        stripped = chunk.strip()
        if stripped:  # do not send whitespace-only chunks to the model
            all_ner_results.extend(ner(stripped))
    if not all_ner_results:
        return no_entities

    # Build the entity map; overlapping chunks may report the same entity
    # twice, so the first label seen for a string is kept.
    entity_map: dict[str, str] = {}
    for ent in all_ner_results:
        word = clean_word(ent["word"])
        if len(word) < 2:
            continue
        entity_map.setdefault(word.lower(), get_label_id(ent["entity_group"]))
    if not entity_map:
        return no_entities

    highlighted = highlight_html(full_text, entity_map)

    # Legend: one badge per entity code that actually occurs.
    badges = "".join(
        f'<span style="background:{LABEL_HEX.get(code, "#e2e8f0")};border-radius:20px;'
        f'padding:3px 10px;font-size:12px;font-weight:600;color:#1e293b;margin:3px;">'
        f'{html.escape(str(LABEL_MAP.get(code, code)), quote=False)}</span>'
        for code in sorted(set(entity_map.values()))
    )
    legend_html = f'<div style="margin-bottom:12px;line-height:2;">{badges}</div>'
    return highlighted, legend_html
# Stylesheet for the app; passed to demo.launch(css=...) at the bottom of
# this file. Selectors target the elem_id / elem_classes values assigned to
# components in the UI section and the ids used inside HERO_HTML.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:wght@700;900&family=DM+Sans:wght@400;500;600&display=swap');
.gradio-container{max-width:100%!important;padding:0!important;background:linear-gradient(160deg,#0f172a 0%,#1e1b4b 40%,#0f172a 100%)!important;min-height:100vh;}
#hero-header{background:linear-gradient(135deg,#1e40af 0%,#6d28d9 50%,#be185d 100%);padding:40px 48px 36px;margin:0 0 24px;border-radius:16px;text-align:center;position:relative;overflow:hidden;}
#hero-header::before{content:'';position:absolute;inset:0;background:url("data:image/svg+xml,%3Csvg width='60' height='60' viewBox='0 0 60 60' xmlns='http://www.w3.org/2000/svg'%3E%3Cg fill='none' fill-rule='evenodd'%3E%3Cg fill='%23ffffff' fill-opacity='0.04'%3E%3Cpath d='M36 34v-4h-2v4h-4v2h4v4h2v-4h4v-2h-4zm0-30V0h-2v4h-4v2h4v4h2V6h4V4h-4zM6 34v-4H4v4H0v2h4v4h2v-4h4v-2H6zM6 4V0H4v4H0v2h4v4h2V6h4V4H6z'/%3E%3C/g%3E%3C/g%3E%3C/svg%3E");}
.tab-nav{background:rgba(255,255,255,0.05)!important;border-radius:12px!important;padding:4px!important;border:1px solid rgba(255,255,255,0.1)!important;margin:0!important;}
.tab-nav button{background:transparent!important;color:#94a3b8!important;border-radius:8px!important;padding:10px 24px!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:14px!important;transition:all 0.2s!important;}
.tab-nav button.selected{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:#ffffff!important;box-shadow:0 4px 12px rgba(109,40,217,0.4)!important;}
label span{color:#cbd5e1!important;font-family:'DM Sans',sans-serif!important;font-weight:600!important;font-size:12px!important;letter-spacing:0.07em!important;text-transform:uppercase!important;}
textarea,.gr-textbox textarea{background:rgba(15,23,42,0.7)!important;border:1.5px solid rgba(255,255,255,0.12)!important;border-radius:10px!important;color:#e2e8f0!important;font-family:'DM Sans',sans-serif!important;font-size:14px!important;padding:14px!important;transition:border 0.2s!important;}
textarea:focus{border-color:#6d28d9!important;outline:none!important;}
.example-btn{display:block!important;width:100%!important;text-align:left!important;padding:12px 16px!important;margin-bottom:8px!important;background:rgba(30,64,175,0.15)!important;border:1px solid rgba(99,102,241,0.30)!important;border-radius:10px!important;cursor:pointer!important;font-size:13px!important;line-height:1.6!important;color:#cbd5e1!important;white-space:normal!important;height:auto!important;font-family:'DM Sans',sans-serif!important;transition:all 0.2s!important;}
.example-btn:hover{background:rgba(109,40,217,0.25)!important;border-color:#6d28d9!important;color:#e2e8f0!important;transform:translateX(3px);}
#analyze-btn,#analyze-pdf-btn{background:linear-gradient(135deg,#1e40af,#6d28d9)!important;color:white!important;font-weight:700!important;font-size:15px!important;border-radius:10px!important;padding:12px 0!important;margin-top:8px!important;font-family:'DM Sans',sans-serif!important;letter-spacing:0.03em!important;box-shadow:0 4px 16px rgba(109,40,217,0.35)!important;transition:all 0.2s!important;border:none!important;}
#analyze-btn:hover,#analyze-pdf-btn:hover{transform:translateY(-2px)!important;box-shadow:0 6px 24px rgba(109,40,217,0.50)!important;}
.section-heading{font-family:'DM Sans',sans-serif;font-weight:700;font-size:11px;letter-spacing:0.12em;text-transform:uppercase;color:#ffffff!important;margin-bottom:12px;display:flex;align-items:center;gap:8px;}
.section-heading::before{content:'';display:inline-block;width:18px;height:2px;background:linear-gradient(90deg,#6d28d9,#be185d);border-radius:2px;}
.gr-file{background:rgba(15,23,42,0.7)!important;border:1.5px dashed rgba(99,102,241,0.40)!important;border-radius:10px!important;color:#94a3b8!important;}
.gr-file-download{background:rgba(30,64,175,0.2)!important;border:1px solid rgba(99,102,241,0.4)!important;border-radius:10px!important;color:#a5b4fc!important;font-family:'DM Sans',sans-serif!important;}
#center-col{max-width:780px!important;margin:0 auto!important;width:100%!important;padding:0 8px!important;}
#footer{text-align:center;padding:20px;color:rgba(148,163,184,0.5);font-family:'DM Sans',sans-serif;font-size:12px;letter-spacing:0.04em;}
.gradio-container h3{color:#e2e8f0!important;}
#center-col p{color:#94a3b8!important;}
"""
# Static banner markup rendered at the top of the page via gr.HTML(HERO_HTML);
# shows the project title, subtitle and the model name.
HERO_HTML = """
<div id="hero-header">
<p style="font-family:'DM Sans',sans-serif;font-size:12px;font-weight:700;letter-spacing:0.20em;text-transform:uppercase;color:rgba(165,180,252,0.8);margin:0 0 10px;">Tugas Kelompok · NLP & Text Mining</p>
<h1 style="font-family:'Playfair Display',serif;font-size:clamp(32px,5vw,56px);font-weight:900;color:#ffffff;margin:0 0 8px;line-height:1.1;text-shadow:0 2px 20px rgba(109,40,217,0.5);">
NER <span style="color:#a78bfa;">for</span> Financial Statements
</h1>
<p style="font-family:'DM Sans',sans-serif;font-size:clamp(13px,2vw,16px);color:rgba(203,213,225,0.85);margin:0 auto 18px;max-width:640px;line-height:1.6;">
Implementasi Named Entity Recognition pada Kumpulan<br>Laporan-laporan Keuangan Bahasa Indonesia
</p>
<div style="display:inline-flex;align-items:center;gap:8px;background:rgba(0,0,0,0.25);border:1px solid rgba(255,255,255,0.15);border-radius:20px;padding:6px 16px 6px 8px;">
<span style="background:linear-gradient(135deg,#1e40af,#6d28d9);border-radius:12px;padding:3px 10px;font-size:11px;font-weight:700;color:#fff;letter-spacing:0.05em;">MODEL</span>
<span style="font-family:monospace;font-size:13px;color:#a5b4fc;">cahya/NusaBert-ner-v1.3</span>
</div>
</div>
"""
# Gradio UI
# Page layout: hero banner, then two tabs (free-text analysis and file
# analysis), then a footer. Components are created in display order.
with gr.Blocks(title="NER for Financial Statements") as demo:
    gr.HTML(HERO_HTML)
    with gr.Tabs(elem_classes="tab-nav"):
        # Tab 1: paste or type text and get a table of entities.
        with gr.Tab("Analisis Teks"):
            with gr.Column(elem_id="center-col"):
                gr.HTML('<div class="section-heading">Contoh Teks</div>')
                # One button per example; the body is kept alongside the
                # button so the click handler can be wired up further down.
                example_btns = []
                for title, body in EXAMPLES:
                    btn = gr.Button(f"📌 {title}\n\n{body}", elem_classes="example-btn")
                    example_btns.append((btn, body))
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Input Teks</div>')
                text_input = gr.Textbox(lines=9, placeholder="Ketik atau tempel teks laporan keuangan di sini…", label="", show_label=False)
                analyze_btn = gr.Button("Lakukan Analisis", elem_id="analyze-btn")
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Hasil Analisis Entitas</div>')
                text_output = gr.HTML(value="<p style='color:#64748b;font-size:14px;font-family:DM Sans,sans-serif;padding:20px;text-align:center;'>Masukkan teks lalu klik Lakukan Analisis.</p>")
        # Tab 2: upload a .pdf or .txt file and get highlighted text plus a legend.
        with gr.Tab("Analisis File"):
            with gr.Column(elem_id="center-col"):
                gr.HTML('<p style="color:#94a3b8;font-family:DM Sans,sans-serif;font-size:13px;margin:0 0 16px;">'
                        'Unggah file .pdf (maks 5 halaman) atau .txt.</p>'
                )
                gr.HTML('<div class="section-heading">Unggah File</div>')
                pdf_input = gr.File(label="", file_types=[".pdf", ".txt"], type="filepath")
                analyze_pdf_btn = gr.Button("Analisis & Highlight Entitas", elem_id="analyze-pdf-btn")
                gr.HTML('<div class="section-heading" style="margin-top:20px;">Entitas Ditemukan</div>')
                pdf_legend = gr.HTML(value="")
                gr.HTML('<div class="section-heading" style="margin-top:12px;">Teks Ter-highlight</div>')
                pdf_output = gr.HTML(value="")
    gr.HTML('<div id="footer">NER for Financial Statements 2026</div>')
    # Wiring
    # Clicking an example copies its body into the text box. `b=body` binds
    # the current loop value as a default so each lambda keeps its own text.
    for btn, body in example_btns:
        btn.click(fn=lambda b=body: b, inputs=[], outputs=text_input)
    # Both the button and submitting the text box run the text analysis.
    analyze_btn.click(fn=run_ner, inputs=text_input, outputs=text_output)
    text_input.submit(fn=run_ner, inputs=text_input, outputs=text_output)
    # run_ner_file returns (highlighted_html, legend_html), in this output order.
    analyze_pdf_btn.click(
        fn=run_ner_file,
        inputs=pdf_input,
        outputs=[pdf_output, pdf_legend],
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch(css=CUSTOM_CSS)