Spaces:
Running
Running
| # ruff: noqa: E501, INP001, FBT001 | |
| from __future__ import annotations | |
| from typing import Dict, List, Tuple | |
| import gradio as gr | |
| import torch | |
| from optimum.onnxruntime import ORTModelForTokenClassification | |
| from transformers import AutoTokenizer | |
# Hugging Face Hub repository id of the PII token-classification model.
# load_model() passes it to both the tokenizer and the ONNX model loaders.
MODEL_NAME = "gravitee-io/bert-small-pii-detection"
def load_model() -> Tuple[ORTModelForTokenClassification, AutoTokenizer]:
    """Load the ONNX token-classification model and its tokenizer from Hugging Face.

    The quantized export (``model.quant.onnx``) is tried first; if loading it
    fails, the regular export (``model.onnx``) is loaded instead. The value of
    the ``HUGGINGFACE_TOKEN`` environment variable (``None`` when unset) is
    passed as ``token`` to every ``from_pretrained`` call.

    Returns:
        A ``(model, tokenizer)`` pair.

    Raises:
        ValueError: If the tokenizer or both model exports cannot be loaded.
            The underlying exception is chained as ``__cause__``.
    """
    import os

    # Read the token once so all three loader calls use the same value.
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hf_token)

        try:
            model = ORTModelForTokenClassification.from_pretrained(
                MODEL_NAME,
                file_name="model.quant.onnx",
                token=hf_token,
            )
        except Exception:
            # Deliberate best-effort fallback: any failure to load the
            # quantized file falls back to the regular export. `Exception`
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            model = ORTModelForTokenClassification.from_pretrained(
                MODEL_NAME,
                file_name="model.onnx",
                token=hf_token,
            )

        return model, tokenizer
    except Exception as e:
        # Keep the original traceback attached for debugging.
        raise ValueError(f"Could not load model {MODEL_NAME}: {e}") from e
def convert_predictions_to_spans(predictions: List[int], offset_mapping: List[Tuple[int, int]], id2label: Dict[int, str], text: str) -> List[Dict]:
    """Convert token-level predictions to entity spans using BIO tagging.

    Args:
        predictions: Predicted class id for each token.
        offset_mapping: ``(start, end)`` character offsets for each token,
            paired positionally with ``predictions``.
        id2label: Maps a class id to its label string (``"O"``, ``"B-XXX"``,
            ``"I-XXX"``).
        text: The string the offsets index into.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``label`` (lower-cased
        entity type without the ``B-``/``I-`` prefix) and ``text`` (the
        covered slice of ``text``), in order of appearance.
    """
    spans = []
    current_entity = None

    for pred, (start, end) in zip(predictions, offset_mapping):
        if start == end == 0:  # Skip special tokens
            continue

        label = id2label[pred]

        if label.startswith("B-"):
            # Begin a new entity, flushing any entity still open.
            if current_entity:
                spans.append(current_entity)
            current_entity = {
                "start": start,
                "end": end,
                "label": label[2:].lower(),
                "text": text[start:end],
            }
        elif label.startswith("I-") and current_entity and label[2:].lower() == current_entity["label"]:
            # Continue the current entity.
            current_entity["end"] = end
            current_entity["text"] = text[current_entity["start"]:end]
        else:
            # "O", an I- tag of a different type, an I- tag with no open
            # entity, or an unrecognized label. All of these end the current
            # entity; otherwise a later matching I- token would stretch the
            # span across unrelated tokens.
            if current_entity:
                spans.append(current_entity)
            current_entity = None

    # Don't forget the last entity
    if current_entity:
        spans.append(current_entity)

    return spans
# Load the model and tokenizer once, at import time; get_model_info() hands
# out these same instances. load_model() raises ValueError on failure and
# nothing catches it here, so a failed load aborts the import.
print("Loading model from Hugging Face...")
_model, _tokenizer = load_model()
print(f"Model {MODEL_NAME} loaded successfully!")
def get_model_info() -> Tuple[ORTModelForTokenClassification, AutoTokenizer]:
    """Return the module-level ``(model, tokenizer)`` pair (already loaded).

    No loading happens here; the objects are the ones assigned to ``_model``
    and ``_tokenizer`` when the module was imported.
    """
    return _model, _tokenizer
def predict_entities(text: str, threshold: float) -> Dict:
    """Predict PII entities in ``text`` using the BERT ONNX model.

    Args:
        text: Input string. It is tokenized with truncation at 512 tokens, so
            anything past that limit gets no predictions.
        threshold: Minimum softmax probability for a token's predicted class
            to be kept; tokens below it are treated as outside any entity.

    Returns:
        ``{"text": text, "entities": [...]}`` where each entity has the keys
        ``entity``, ``word``, ``start``, ``end`` and ``score``. If anything
        raises, the result is ``{"text": text, "entities": [], "error": str(exc)}``
        instead of propagating the exception.
    """
    try:
        model, tokenizer = get_model_info()

        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True,
                           return_offsets_mapping=True, max_length=512)
        # The offsets are only needed to build spans; pop them so they are
        # not forwarded to the model call below.
        offset_mapping = inputs.pop("offset_mapping")[0].tolist()

        # Run inference
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_class_ids = torch.argmax(predictions, dim=-1)[0].tolist()
            prediction_scores = torch.max(predictions, dim=-1)[0][0].tolist()

        id2label = model.config.id2label
        # Look up the id of the "O" label instead of assuming it is 0;
        # fall back to 0 (the previous hard-coded value) if no "O" exists.
        outside_id = next(
            (class_id for class_id, name in id2label.items() if name == "O"), 0
        )

        # Filter by threshold: low-confidence tokens become "O".
        filtered_predictions = [
            pred if score >= threshold else outside_id
            for pred, score in zip(predicted_class_ids, prediction_scores)
        ]

        # Convert to spans
        spans = convert_predictions_to_spans(filtered_predictions, offset_mapping, id2label, text)

        # Convert to gradio format
        entities = [
            {
                "entity": span["label"],
                "word": span["text"],
                "start": span["start"],
                "end": span["end"],
                "score": 1.0,  # We already filtered by threshold
            }
            for span in spans
        ]

        return {
            "text": text,
            "entities": entities,
        }
    except Exception as e:
        # Top-level boundary for the UI: report the failure in the result
        # rather than raising into the Gradio callback.
        return {
            "text": text,
            "entities": [],
            "error": str(e),
        }
def format_text(text: str, format_type: str) -> str:
    """Pretty-print ``text`` according to ``format_type``.

    Formatting is best-effort and only changes layout: when the input cannot
    be parsed, the original ``text`` is returned unchanged.

    Args:
        text: The raw input string.
        format_type: One of ``"JSON"``, ``"XML"``, ``"HTML"``, ``"SQL"``.
            ``"None"`` and any other value return ``text`` as is.

    Returns:
        The formatted string, or ``text`` itself when no formatting applies.
    """
    if format_type == "JSON":
        import json

        try:
            # ensure_ascii=False keeps non-ASCII characters as typed instead
            # of rewriting them into \uXXXX escapes.
            return json.dumps(json.loads(text), indent=2, ensure_ascii=False)
        except (ValueError, RecursionError):
            # Not valid JSON (JSONDecodeError is a ValueError) or too deeply
            # nested to parse.
            return text

    if format_type == "XML":
        try:
            import xml.etree.ElementTree as ET
            from xml.dom import minidom

            # Remove b' prefix if present (text pasted from a bytes repr).
            clean_text = text
            if text.startswith("b'") and text.endswith("'"):
                clean_text = text[2:-1]

            # NOTE(review): the stdlib XML parsers are not hardened against
            # maliciously constructed documents; this input comes from a
            # user-facing text box.
            root = ET.fromstring(clean_text)
            rough_string = ET.tostring(root, "unicode")
            reparsed = minidom.parseString(rough_string)
            return reparsed.toprettyxml(indent=" ")
        except Exception:
            # Deliberate best-effort: any parse failure leaves text untouched.
            return text

    if format_type == "HTML":
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(text, "html.parser")
            return soup.prettify()
        except Exception:
            # bs4 missing or parsing failed. Fallback: simple HTML formatting.
            formatted = text.replace("><", ">\n<")
            formatted = formatted.replace("<tr>", "\n <tr>")
            formatted = formatted.replace("<td>", "\n <td>")
            formatted = formatted.replace("<th>", "\n <th>")
            return formatted

    if format_type == "SQL":
        import re

        # Simple SQL formatting: start a new line before each clause keyword.
        # Keywords are matched case-insensitively and upper-cased; the rest of
        # the statement (identifiers, string literals) is left exactly as
        # written so the values are not altered.
        def _break_before(match):
            keyword = match.group(1).upper()
            prefix = "\n " if keyword in ("AND", "OR") else "\n"
            return prefix + keyword

        # The trailing space is a lookahead so it can also serve as the
        # leading space of an immediately following keyword.
        return re.sub(
            r" (ORDER BY|GROUP BY|FROM|WHERE|AND|OR|HAVING|LIMIT)(?= )",
            _break_before,
            text,
            flags=re.IGNORECASE,
        )

    # "None" or an unrecognized format type: nothing to do.
    return text
def ner(text: str, threshold: float, data_type: str = None, format_input: bool = False) -> List[Tuple[str, str]]:
    """Gradio callback: run PII detection and return highlighted segments.

    The text is passed through ``format_text`` first when ``format_input`` is
    set and ``data_type`` is given and is not ``"Documents"``. The returned
    list alternates plain segments ``(substring, None)`` with entity segments
    ``(entity_text, LABEL)``. When prediction reports an error, the whole
    text comes back as a single unlabeled segment.
    """
    # Decide once which text is both analyzed and displayed.
    wants_formatting = format_input and data_type and data_type != "Documents"
    display_text = format_text(text, data_type) if wants_formatting else text
    result = predict_entities(display_text, threshold)

    if "error" in result:
        return [(display_text, None)]

    # Walk the entities left to right, emitting the gaps between them.
    ordered = list(result["entities"])
    ordered.sort(key=lambda item: item["start"])

    segments = []
    cursor = 0
    for item in ordered:
        begin = item["start"]
        if begin > cursor:
            # Plain text between the previous entity and this one.
            segments.append((display_text[cursor:begin], None))
        segments.append((item["word"], item["entity"].upper()))
        cursor = item["end"]

    # Trailing text after the last entity.
    if cursor < len(display_text):
        segments.append((display_text[cursor:], None))

    return segments
| examples = [ | |
| # JSON samples | |
| [ | |
| '{\"api_key\": \"9ewl5\", \"page\": \"82\", \"max_primary_general_date\": \"1998-02-01\", \"sort\": \"nz siw\", \"election_type_id\": \"guerv jgwbunon guerv\", \"election_district\": \"03vpuute\", \"max_election_date\": \"1980-12-30\", \"sort_null_only\": \"false\", \"min_election_date\": \"2003-03-05\", \"per_page\": \"96\", \"min_primary_general_date\": \"1991-05-29\", \"election_state\": \"f9u4gfgt pzji\", \"election_party\": \"\", \"min_update_date\": \"1998-01-26\", \"sort_nulls_last\": \"false\", \"max_create_date\": \"1970-10-19\", \"office_sought\": \"rz1thr5zp\", \"max_update_date\": \"2018-12-12\", \"sort_hide_null\": \"true\", \"election_year\": \"alrcfqpswf\", \"min_create_date\": \"2003-02-18\"}', | |
| 0.35, | |
| "JSON" | |
| ], | |
| [ | |
| '{\"sort\": \"\", \"incumbent_challenge\": \"rQ a\", \"longitude\": \"-98.705515\", \"has_raised_funds\": \"True\", \"airport\": \"New Orleans International airport\", \"office\": \"\", \"candidate_status\": \"e\", \"district\": \"\", \"sort_nulls_last\": \"True\", \"per_page\": \"344387016\", \"state\": \"Texas\", \"location\": \"-89.030682\", \"airport_icao\": \"KOKC\", \"api_key\": \"\", \"origin airport code\": \"LIS\", \"year\": \"2012\", \"sort_hide_null\": \"False\", \"cycle\": \"VAnEFSGu LDiJQtw LDiJQtw\", \"lat\": \"33.182925\", \"sort_null_only\": \"False\", \"page\": \"5661254\", \"election_year\": \"\", \"federal_funds_flag\": \"False\", \"party\": \"\", \"name\": \"OSsUo\"}', | |
| 0.35, | |
| "JSON" | |
| ], | |
| [ | |
| '{\"nationality\": \"American\", \"keyStorePass\": \"LObizj\", \":operation\": \"XSnpUioywM iOF5gN1bHM\", \"currentPassword\": \"wo3vooch8Ie\", \"nation_plural\": \"north-americans\", \"alias\": \"aoJPk aoJPk\", \"prefix\": \"Mr.\", \"prefix_male\": \"Mr.\", \"newAlias\": \"\", \"nation_woman\": \"western samoan\", \"newPassword\": \"UVpvCQ UVpvCQ\", \"keyPassword\": \"k4GWWlP@@z\", \"nation_man\": \"bahraini\", \"rePassword\": \"\", \"removeAlias\": \"o\"}', | |
| 0.35, | |
| "JSON" | |
| ], | |
| [ | |
| '{\"imei\": \"27-051998-738345-4\", \"post-code\": \"28403\", \"startTime\": \"1996-04-20 02:21:52\", \"timeGrain\": \"0f8Jl9qmZ3 cJSVXOylw\", \"longitude\": \"-77.952502\", \"latitude\": \"34.258789\", \"endTime\": \"1994-08-17 13:38:00\", \"api-version\": \"HDjWC jcOLlPG8W\", \"key store password\": \"ahZeT2ee\", \"bank account\": \"KEKY41344355014443\"}', | |
| 0.35, | |
| "JSON" | |
| ], | |
| # SQL samples | |
| [ | |
| 'SELECT \"endTime,startTime,age,nation_woman,national identity,arline name,airport_icao,coordinate,api-version\",\"api-version\",CASE WHEN \"endTime\" THEN \'skin\' WHEN \"startTime\"=\'1992-01-13 23:33:10\' THEN \'president\' WHEN \"age\"=\'31\' THEN \'be\' WHEN \"nation_woman\"=\'syrian\' THEN \'particular\' WHEN \"national identity\"<>\'600233955\' THEN \'trip\' WHEN \"arline name\"<>\'Shanghai Airlines\' THEN \'present\' WHEN \"airport_icao\"<>\'SBJP\' THEN \'forget\' WHEN \"coordinate\"=\'52.297060\' THEN \'car\' WHEN \"api-version\" THEN \'also\' END FROM \"not\" WHERE \"endTime\" AND \"startTime\"=\'1973-12-27 11:08:01\' AND (\"age\"=\'64\' OR \"age\"=\'answer\') AND \"nation_woman\"<>\'guyanese\' AND \"national identity\"<>\'142451774\' AND \"arline name\" AND \"airport_icao\" AND \"coordinate\"=\'46.828790\' AND (\"api-version\"=\'KOikhS KOikhS yz\' OR \"api-version\"=\'activity\') LIMIT 64', | |
| 0.35, | |
| "SQL" | |
| ], | |
| [ | |
| 'SELECT \"week__day,Version,Tags,age,currency_code,TargetBucket,expiration-date,TargetSnapshotName,swift-code,KmsKeyId,Action,debit card,SourceSnapshotName\",\"SourceSnapshotName\",CASE WHEN \"week__day\"=\'Saturday\' THEN \'serious\' WHEN \"Version\"=\'2015-02-02\' OR \"Version\"=\'staff\' THEN \'country\' WHEN \"Tags\"<>\'\' THEN \'water\' WHEN \"age\" THEN \'behind\' WHEN \"currency_code\"=\'CAD\' THEN \'position\' WHEN \"TargetBucket\" THEN \'next\' WHEN \"expiration-date\"=\'11/2023\' OR \"expiration-date\"=\'technology\' THEN \'kid\' WHEN \"TargetSnapshotName\"=\'pWJ\' OR \"TargetSnapshotName\"=\'give\' THEN \'child\' WHEN \"swift-code\"=\'GWIZGBQPBUW\' THEN \'poor\' WHEN \"KmsKeyId\" THEN \'meeting\' WHEN \"Action\"=\'CopySnapshot\' THEN \'collection\' WHEN \"debit card\"<>\'30381983513092\' THEN \'paper\' WHEN \"SourceSnapshotName\"=\'\' THEN \'keep\' END FROM \"statement\" WHERE \"week__day\"=\'Tuesday\' AND \"Version\"=\'2015-02-02\' AND \"Tags\"=\'\' AND \"age\"=\'20\' AND \"currency_code\"=\'MGA\' AND \"TargetBucket\"=\'\' AND \"expiration-date\"=\'02/24\' AND \"TargetSnapshotName\"=\'\' AND \"swift-code\"=\'GNCHGBZC\' AND \"KmsKeyId\"=\'\' AND \"Action\"=\'CopySnapshot\' AND \"debit card\"=\'4534384187682\' AND \"SourceSnapshotName\"=\'\' LIMIT 36', | |
| 0.35, | |
| "SQL" | |
| ], | |
| [ | |
| 'SELECT \"expiration-date,prettyPrint,alt,master-card,arline__name,key,bank city,fields,building,quotaUser,userIp,to country code,oauth_token\",\"oauth_token\",CASE WHEN \"expiration-date\"=\'3/2024\' THEN \'reduce\' WHEN \"prettyPrint\"=\'False\' OR \"prettyPrint\"=\'south\' THEN \'within\' WHEN \"alt\"<>\'json\' THEN \'thing\' WHEN \"master-card\" THEN \'strategy\' WHEN \"arline__name\"=\'Air India\' THEN \'forward\' WHEN \"key\" THEN \'artist\' WHEN \"bank city\"=\'Helena\' OR \"bank city\"=\'more\' THEN \'pay\' WHEN \"fields\"=\'\' OR \"fields\"=\'thing\' THEN \'rest\' WHEN \"building\"=\'977\' THEN \'executive\' WHEN \"quotaUser\" THEN \'safe\' WHEN \"userIp\"=\'pWJ\' THEN \'whom\' WHEN \"to country code\"<>\'US\' THEN \'not\' WHEN \"oauth_token\"=\'\' THEN \'choice\' END FROM \"wrong\" WHERE (\"expiration-date\"=\'05/23\' OR \"expiration-date\"=\'language\') AND \"prettyPrint\"=\'True\' AND \"alt\"<>\'json\' AND \"master-card\"=\'349245482859346\' AND \"arline__name\"=\'Indonesia AirAsia\' AND \"key\"=\'\' AND \"bank city\"=\'Georgetown\' AND \"fields\"=\'\' AND \"building\"=\'7241\' AND \"quotaUser\"=\'\' AND \"userIp\"=\'\' AND \"to country code\"=\'TM\' AND \"oauth_token\"=\'\' LIMIT 64', | |
| 0.35, | |
| "SQL" | |
| ], | |
| [ | |
| 'SELECT `schemaName,databaseName,city,building,coordinate,state_abbreviation,driver license,international__mobile__equipment__identity`,`international__mobile__equipment__identity`,CASE WHEN `schemaName`<>\'fX04 bHQKn bHQKn\' THEN \'far\' WHEN `databaseName` THEN \'college\' WHEN `city`=\'Orlando\' OR `city`=\'probably\' THEN \'boy\' WHEN `building`<>\'2672\' THEN \'wind\' WHEN `coordinate`=\'-21.907687\' THEN \'offer\' WHEN `state_abbreviation`=\'FL\' THEN \'its\' WHEN `driver license`=\'H872538367807\' THEN \'lose\' WHEN `international__mobile__equipment__identity`=\'42-161139-363377-6\' OR `international__mobile__equipment__identity`=\'attention\' THEN \'nor\' END FROM `business` WHERE (`schemaName`=\'BfgAeXWjbC BfgAeXWjbC\' OR `schemaName`=\'across\') AND `databaseName`<>\'hw w\' AND `city`=\'West Caroline\' AND `building`<>\'44030\' AND `coordinate`=\'-21.907687\' AND `state_abbreviation`=\'IA\' AND `driver license`=\'224242065\' AND `international__mobile__equipment__identity`=\'83-695777-883364-1\' LIMIT 10', | |
| 0.35, | |
| "SQL" | |
| ], | |
| # XML samples | |
| [ | |
| 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><sort type=\"str\"></sort><incumbent_challenge type=\"str\"></incumbent_challenge><longitude type=\"str\">-97.518538</longitude><has_raised_funds type=\"str\">True</has_raised_funds><airport type=\"str\">John F Kennedy International airport</airport><office type=\"str\">IDuqbH m</office><candidate_status type=\"str\">qEw3Tpc wmYqRUtTH</candidate_status><district type=\"str\">D UCd6ZAFD D</district><sort_nulls_last type=\"str\">False</sort_nulls_last><per_page type=\"str\">7720</per_page><state type=\"str\">South Dakota</state><location type=\"str\">-109.575655</location><airport_icao type=\"str\">EDDH</airport_icao><api_key type=\"str\">46nCNe0 Wj Wj</api_key><origin_airport_code type=\"str\">DEN</origin_airport_code><year type=\"str\">1996</year><sort_hide_null type=\"str\">False</sort_hide_null><cycle type=\"str\">FNxL</cycle><lat type=\"str\">43.16524</lat><sort_null_only type=\"str\">False</sort_null_only><page type=\"str\">4894426</page><election_year type=\"str\"></election_year><federal_funds_flag type=\"str\">False</federal_funds_flag><party type=\"str\"></party><name type=\"str\">aKPjF</name></root>\'', | |
| 0.35, | |
| "XML" | |
| ], | |
| [ | |
| 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><api_key type=\"str\">E hMCQl hMCQl</api_key><page type=\"str\">984478</page><max_primary_general_date type=\"str\">2008-01-29</max_primary_general_date><sort type=\"str\"></sort><election_type_id type=\"str\">L85O2N</election_type_id><election_district type=\"str\">M</election_district><max_election_date type=\"str\">2017-08-07</max_election_date><sort_null_only type=\"str\">False</sort_null_only><min_election_date type=\"str\">2007-07-01</min_election_date><per_page type=\"str\">452141118</per_page><min_primary_general_date type=\"str\">1977-07-12</min_primary_general_date><election_state type=\"str\"></election_state><election_party type=\"str\">CH4 Ceq Ceq</election_party><min_update_date type=\"str\">1980-04-11</min_update_date><sort_nulls_last type=\"str\">False</sort_nulls_last><max_create_date type=\"str\">1997-04-23</max_create_date><max_update_date type=\"str\">2020-12-25</max_update_date><sort_hide_null type=\"str\">True</sort_hide_null><election_year type=\"str\">v0rF4t8</election_year><min_create_date type=\"str\">2013-11-30</min_create_date></root>\'', | |
| 0.35, | |
| "XML" | |
| ], | |
| [ | |
| 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><nationality type=\"str\">American</nationality><last_name_male type=\"str\">Hayden</last_name_male><NextToken type=\"str\">YX8Fh4d NiOugSJPwm NiOugSJPwm</NextToken><StartDate type=\"str\">2007-04-07</StartDate><EndDate type=\"str\">1971-05-28</EndDate><family-name-female type=\"str\">Weishaar</family-name-female><PageSize type=\"str\">19750435</PageSize><prefix_male type=\"str\">Mr.</prefix_male><given__name__female type=\"str\">Dara</given__name__female><nation_man type=\"str\">bulgarian</nation_man></root>\'', | |
| 0.35, | |
| "XML" | |
| ], | |
| [ | |
| 'b\'<?xml version=\"1.0\" encoding=\"UTF-8\" ?><root><imei type=\"str\">30-696164-389965-5</imei><post-code type=\"str\">33179</post-code><startTime type=\"str\">2017-02-05 13:11:21</startTime><timeGrain type=\"str\">S</timeGrain><longitude type=\"str\">-80.270951</longitude><latitude type=\"str\">25.898545</latitude><endTime type=\"str\">1990-02-04 22:51:09</endTime><api-version type=\"str\">Ad Ad wM5NWqRt</api-version><key_store_password type=\"str\">Shohr3aep</key_store_password><bank_account type=\"str\">BZEV05211288606606</bank_account></root>\'', | |
| 0.35, | |
| "XML" | |
| ], | |
| # HTML samples | |
| [ | |
| '<table border=\"1\"><tr><th>api_key</th><td>PmtrSlgEzO PmtrSlgEzO br</td></tr><tr><th>page</th><td>73595</td></tr><tr><th>max_primary_general_date</th><td>1992-09-22</td></tr><tr><th>sort</th><td>RqJu PZwhjrbcS</td></tr><tr><th>election_type_id</th><td>PFTZDOBxIl</td></tr><tr><th>election_district</th><td>XNc7rk</td></tr><tr><th>max_election_date</th><td>2007-02-15</td></tr><tr><th>sort_null_only</th><td>False</td></tr><tr><th>min_election_date</th><td>2014-06-27</td></tr><tr><th>per_page</th><td>62971536</td></tr><tr><th>min_primary_general_date</th><td>1982-03-22</td></tr><tr><th>election_state</th><td>xzJis</td></tr><tr><th>election_party</th><td>lHUet 1vtAg5J lHUet</td></tr><tr><th>min_update_date</th><td>1984-07-25</td></tr><tr><th>sort_nulls_last</th><td>False</td></tr><tr><th>max_create_date</th><td>1980-01-02</td></tr><tr><th>max_update_date</th><td>1997-11-10</td></tr><tr><th>sort_hide_null</th><td>True</td></tr><tr><th>election_year</th><td>hNf2nYGMbX</td></tr><tr><th>min_create_date</th><td>2000-11-25</td></tr></table>', | |
| 0.35, | |
| "HTML" | |
| ], | |
| [ | |
| '<table border=\"1\"><tr><th>religion</th><td>Christianity</td></tr><tr><th>api-version</th><td>dCwMNqR</td></tr><tr><th>to_contact</th><td>[email protected]</td></tr><tr><th>spot</th><td>6765 2278 Norma Avenue Mcbee , SC 33987</td></tr><tr><th>endTime</th><td>2022-09-07 14:17:30</td></tr><tr><th>startTime</th><td>2001-09-20 20:45:43</td></tr><tr><th>facility</th><td>Apt. 074</td></tr><tr><th>vocation</th><td>Lay-out worker</td></tr><tr><th>alley</th><td>1697 2496 White Pine Lane Apt. 904</td></tr></table>', | |
| 0.35, | |
| "HTML" | |
| ], | |
| [ | |
| '<table border=\"1\"><tr><th>imei</th><td>25-894407-891989-9</td></tr><tr><th>post-code</th><td>2142</td></tr><tr><th>startTime</th><td>2001-06-20 10:16:33</td></tr><tr><th>timeGrain</th><td></td></tr><tr><th>longitude</th><td>-70.990988</td></tr><tr><th>latitude</th><td>42.32382</td></tr><tr><th>endTime</th><td>1971-08-20 19:09:13</td></tr><tr><th>api-version</th><td>u zNS zNS</td></tr><tr><th>key store password</th><td>teiy1oD5ie</td></tr><tr><th>bank account</th><td>FILW85959012098599</td></tr></table>', | |
| 0.35, | |
| "HTML" | |
| ], | |
| [ | |
| '<table border=\"1\"><tr><th>country</th><td>United States</td></tr><tr><th>address</th><td>0133 2669 Locust Street Suite 601 Fort Gaines United States</td></tr><tr><th>project</th><td></td></tr><tr><th>nation_plural</th><td>vietnameses</td></tr><tr><th>urban__area</th><td>Buena Park</td></tr><tr><th>region</th><td>California</td></tr><tr><th>street</th><td>01474 3910 Melody Lane Apt. 383</td></tr><tr><th>phone-country-code</th><td>US</td></tr><tr><th>spot</th><td>Apt. 554</td></tr></table>', | |
| 0.35, | |
| "HTML" | |
| ], | |
| # Natural Text examples | |
| [ | |
| "Dr. Sarah Martinez, age 34, works as a Senior Data Scientist at TechCorp International. Her employee ID is TC-DS-5591 and she joined the company on 2019-03-15. Sarah lives at 1247 Oak Avenue, Apartment 5B, Portland, Oregon 97205. Her work phone is 503-555-0147 and personal email is [email protected]. For banking, she uses account TCBK89012345678901 at First National Bank. Her driver's license number is OR-DL-M8829134 and her social security number is 123-45-6789. She recently traveled to London using passport US-P-543216789 and her frequent flyer number with Delta Airlines is DL987654321.", | |
| 0.35, | |
| "Documents" | |
| ], | |
| [ | |
| "The customer database contains the following entries: Michael Chen (DOB: 1985-07-22, age 38) residing at 789 Pine Street, Suite 200, San Francisco, CA 94102. His contact details include phone 415-555-0298 and email [email protected]. Financial information: Chase Bank account CH-5567889012345678, credit card 4532-1234-5678-9012 (exp: 08/2027, CVV: 451). Professional details: Software Engineer at InnovateTech LLC, employee ID IT-SE-7793, salary $125,000. Government IDs include SSN 987-65-4321, California driver's license CA-DL-B1234567, and passport number US-578912345. His device MAC address is aa:bb:cc:dd:ee:ff and IMEI 358240051111110.", | |
| 0.35, | |
| "Documents" | |
| ], | |
| [ | |
| "Security incident report for Lisa Thompson (ID: LT-2023-001): On 2023-11-15 at 14:30 PST, user accessed system from IP address 192.168.1.100 using API key api_key_abc123xyz789. Employee details: Lisa Thompson, age 29, title Senior Security Analyst, department Cybersecurity, hired 2021-09-01. Home address: 456 Maple Drive, Unit 3C, Seattle, WA 98109. Contact: phone 206-555-0189, work email [email protected]. Banking: Wells Fargo account WF-4455667788990011, routing number 021000021. Government IDs: SSN 555-44-3333, WA driver's license WA-DL-THOMP567, passport US-890123456. Vehicle: 2020 Honda Civic, license plate WA-ABC1234, VIN 1HGBH41JXMN109186.", | |
| 0.35, | |
| "Documents" | |
| ], | |
| [ | |
| "Patient intake form: Dr. Robert Kim (Medical License: MD-12345-WA), age 42, practices at Seattle General Hospital, 1500 Medical Center Drive, Seattle, WA 98101. Phone: 206-555-0234, fax: 206-555-0235, email: [email protected]. Patient information: Jennifer Walsh, DOB 1990-12-03 (age 33), SSN 111-22-3333, address 2100 Broadway Ave, Apt 15D, Seattle, WA 98122. Insurance: Blue Cross Blue Shield, policy BC-556677889900, group 12345. Emergency contact: Mark Walsh (spouse), phone 206-555-0167. Medical history includes prescription for Medication XYZ, DEA number DR1234567. Appointment scheduled for 2024-01-20 at 10:00 AM, confirmation code CONF-789456.", | |
| 0.35, | |
| "Documents" | |
| ], | |
| ] | |
# Build the Gradio page: description, reference accordions, input controls,
# the highlighted output, clickable examples, and the event wiring.
with gr.Blocks(title="Gravitee BERT PII") as demo:
    gr.Markdown(
        f"""
# Gravitee BERT PII (Personally Identifiable Information extraction)
This application uses the **{MODEL_NAME}** model for Named Entity Recognition (NER) to detect personally identifiable information.
The model uses token classification with BIO tagging to identify predefined entity types including names, addresses,
financial information, and more.
"""
    )

    with gr.Accordion("Available Entity Types", open=False):
        gr.Markdown(
            """
The BERT models can detect the following entity types:
**Personal Information:**
- PERSON (names)
- AGE
- PHONE_NUMBER
- EMAIL_ADDRESS
**Location & Address:**
- LOCATION
- COORDINATE
**Financial:**
- CREDIT_CARD
- IBAN_CODE
- FINANCIAL
- US_BANK_NUMBER
**Government IDs:**
- US_SSN (Social Security Number)
- US_DRIVER_LICENSE
- US_PASSPORT
- US_ITIN
- US_LICENSE_PLATE
- NRP (National Registration Number)
**Technical:**
- IP_ADDRESS
- MAC_ADDRESS
- URL
- IMEI
- PASSWORD
**Other:**
- DATE_TIME
- ORGANIZATION
- TITLE
"""
        )

    with gr.Accordion("How to run this model locally", open=False):
        gr.Markdown(
            """
## Installation
To use this model, install the required dependencies:
```
pip install transformers optimum[onnxruntime] torch
```
## Usage
Load the model using the Optimum library for ONNX Runtime:
```python
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import AutoTokenizer
model_path = "gravitee-io/bert-small-pii-detection"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = ORTModelForTokenClassification.from_pretrained(model_path, file_name="model.onnx")
text = "John Doe lives at 123 Main St and his email is [email protected]"
inputs = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)
outputs = model(**inputs)
```
"""
        )

    # The first example pre-populates the text box and the data-type field.
    default_text = examples[0][0]
    default_type = examples[0][2]

    input_text = gr.Textbox(
        value=default_text,
        label="Text input",
        placeholder="Enter your text here",
    )

    with gr.Row():
        threshold = gr.Slider(
            minimum=0,
            maximum=1,
            value=0.35,
            step=0.01,
            label="Confidence Threshold",
            info="Lower the threshold to get more predictions with lower confidence.",
            scale=2,
        )
        # Read-only: the value changes only when an example row is selected.
        data_type_display = gr.Textbox(
            value=default_type,
            label="Data Type",
            interactive=False,
            scale=1,
        )
        format_checkbox = gr.Checkbox(
            value=False,
            label="Format Text",
            info="Auto-format JSON, XML, HTML, SQL with proper indentation",
            scale=1,
        )

    output = gr.HighlightedText(label="Predicted Entities")
    submit_btn = gr.Button("Submit")

    # Every trigger calls ner() with the same four inputs, in this order.
    ner_inputs = [input_text, threshold, data_type_display, format_checkbox]

    examples_component = gr.Examples(
        examples=examples,
        fn=ner,
        inputs=ner_inputs,
        outputs=output,
        cache_examples=False,
    )

    # Event handlers: Enter in the text box, releasing the slider, toggling
    # the checkbox, and clicking the button all re-run the prediction.
    for register in (
        input_text.submit,
        threshold.release,
        format_checkbox.change,
        submit_btn.click,
    ):
        register(fn=ner, inputs=ner_inputs, outputs=output)
# Script entry point: enable the request queue, then start the server
# with debug output.
if __name__ == "__main__":
    demo.queue().launch(debug=True)