Spaces:
Runtime error
Runtime error
| """A Gradio app for anonymizing text data using FHE.""" | |
| import os | |
| import re | |
| from typing import Dict, List | |
| import gradio as gr | |
| import pandas as pd | |
| from fhe_anonymizer import FHEAnonymizer | |
| from openai import OpenAI | |
| from utils_demo import * | |
# Static demo content. The original document is split on blank lines so that each
# paragraph becomes one selectable entry; the anonymized document stays a single string.
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)

# Lookup from an original sentence to a pair whose first item is a sort key and whose
# second item is the anonymized sentence (see `select_static_sentences_fn`).
MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)

# Helper from `utils_demo`, invoked once at start-up before the anonymizer is built.
# NOTE(review): presumably wipes artifacts left by a previous session - confirm in utils_demo.
clean_directory()

anonymizer = FHEAnonymizer()

# The OpenAI API key is taken from the "openaikey" environment variable.
client = OpenAI(api_key=os.getenv("openaikey"))
def select_static_sentences_fn(selected_sentences: List):
    """Rebuild the anonymized document from the currently selected original sentences.

    Args:
        selected_sentences (List): Original sentences that are ticked in the checkbox group.
            Each one must be a key of `MAPPING_SENTENCES`.

    Returns:
        dict: An update for `anonymized_doc_box` holding the matching anonymized sentences,
            ordered by the first item of their mapping entry and joined by blank lines.
    """
    mapped_pairs = []
    for original in selected_sentences:
        mapped_pairs.append(MAPPING_SENTENCES[original])

    # Order by the first item of each pair so the output does not depend on click order.
    mapped_pairs.sort(key=lambda pair: pair[0])

    joined_text = "\n\n".join([anonymized for _, anonymized in mapped_pairs])
    return {anonymized_doc_box: gr.update(value=joined_text)}
def key_gen_fn() -> Dict:
    """Generate the keys and report the outcome on the key-generation button.

    Returns:
        dict: An update for `gen_key_btn` whose value is a success label when the
            evaluation key file exists after generation, or an error message otherwise.
    """
    print("Key Gen..")

    anonymizer.generate_key()

    # The presence of the evaluation key file is used as the success signal.
    # The variable name is part of the error text below (f-string `=` specifier).
    evaluation_key_path = KEYS_DIR / "evaluation_key"

    if evaluation_key_path.is_file():
        return {gen_key_btn: gr.update(value="Keys have been generated β ")}

    error_message = (
        f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}"
    )
    print(error_message)
    return {gen_key_btn: gr.update(value=error_message)}
def encrypt_query_fn(query):
    """Encrypt the user query and show a short hexadecimal preview of the ciphertext.

    Args:
        query: The text typed in (or selected for) the user query box.

    Returns:
        dict: A single component update. It targets `output_encrypted_box` for the
            missing-key error and for the ciphertext preview, and `query_box` when the
            query is rejected.
    """
    print(f"Query: {query}")

    # Keys must exist before anything can be encrypted.
    if not (KEYS_DIR / "evaluation_key").is_file():
        return {output_encrypted_box: gr.update(value="Error β: Please generate the key first!")}

    # NOTE(review): the query is rejected when `is_user_query_valid` returns a truthy
    # value, which reads as inverted given the helper's name - confirm its contract in
    # utils_demo before changing anything here.
    query_rejected = is_user_query_valid(query)
    if query_rejected:
        # TODO: check if the query is related to our context
        error_msg = (
            "Unable to process β: The request exceeds the length limit or falls "
            "outside the scope of this document. Please refine your query."
        )
        print(error_msg)
        return {query_box: gr.update(value=error_msg)}

    anonymizer.encrypt_query(query)

    # Only a 10-character slice of each token's hex dump is displayed.
    hex_previews = []
    for token in read_pickle(KEYS_DIR / "encrypted_quantized_query"):
        hex_previews.append(token.hex()[500:510])

    return {output_encrypted_box: gr.update(value=" ".join(hex_previews))}
def run_fhe_fn(query_box):
    """Run the anonymization on the encrypted query and return the decrypted result.

    Args:
        query_box: The user query, forwarded to the anonymizer.

    Returns:
        Either a dict updating `anonymized_text_output` with an error message (when the
        keys or the encrypted query are missing), or a tuple
        `(anonymized_text, identified_df)` where `identified_df` is a DataFrame with the
        columns "Identified Words" and "Probability".
    """
    # Files that must already be on disk, checked in this order, with their error texts.
    prerequisites = (
        ("evaluation_key", "Error β: Please generate the key first!"),
        ("encrypted_quantized_query", "Error β: Please encrypt your query first!"),
    )
    for file_name, error_message in prerequisites:
        if not (KEYS_DIR / file_name).is_file():
            return {anonymized_text_output: gr.update(value=error_message)}

    anonymizer.run_server_and_decrypt_output(query_box)

    anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence")
    words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob")

    # Build the table of identified words; fall back to an empty, correctly labelled frame.
    column_names = ["Identified Words", "Probability"]
    if words_with_prob:
        identified_df = pd.DataFrame(words_with_prob, columns=column_names)
    else:
        identified_df = pd.DataFrame(columns=column_names)

    return anonymized_text, identified_df
def _deanonymize_text(text, inverse_uuid_map):
    """Swap every anonymized identifier in `text` for its original value, leaving the rest as is."""
    return re.sub(
        r"\b[\w\.\/\-@]+\b",
        lambda match: inverse_uuid_map.get(match.group(0), match.group(0)),
        text,
    )


def query_chatgpt_fn(anonymized_query, anonymized_document):
    """Send the anonymized document and query to ChatGPT and de-anonymize the answer.

    Args:
        anonymized_query: The anonymized query produced by the FHE step.
        anonymized_document: The anonymized document shown in the interface.

    Returns:
        Either a dict updating `chatgpt_response_anonymized` with an error message (when a
        previous step has not been run yet), or a tuple
        `(anonymized_response, deanonymized_response)`.
    """
    # Files that previous steps must have produced, checked in this order.
    prerequisites = (
        ("evaluation_key", "Error β: Please generate the key first!"),
        ("encrypted_quantized_query", "Error β: Please encrypt your query first!"),
        ("reconstructed_sentence", "Error β: Please run the FHE computation first!"),
    )
    for file_name, error_message in prerequisites:
        if not (KEYS_DIR / file_name).is_file():
            # The dict key must be one of this handler's registered outputs, otherwise
            # Gradio raises instead of displaying the message.
            return {chatgpt_response_anonymized: gr.update(value=error_message)}

    prompt = read_txt(PROMPT_PATH)

    # Prepare prompt
    # NOTE(review): there is no newline between the closing document fence and "Query:";
    # kept as is because changing it alters what is sent to the model.
    query = (
        "Document content:\n```\n"
        + anonymized_document
        + "\n\n```"
        + "Query:\n```\n"
        + anonymized_query
        + "\n```"
    )
    print(prompt + "\n")

    completion = client.chat.completions.create(
        model="gpt-4-1106-preview",  # Replace with "gpt-4" if available
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": query},
        ],
    )
    # `content` can be None (e.g. a refusal); treat that as an empty answer.
    anonymized_response = completion.choices[0].message.content or ""

    uuid_map = read_json(MAPPING_UUID_PATH)
    # TODO load the inverse mapping from disk for efficiency
    inverse_uuid_map = {v: k for k, v in uuid_map.items()}

    # Substitute in place rather than tokenizing and re-joining, so characters that are
    # neither word-like nor common punctuation (brackets, "*", "#", ...) are preserved.
    deanonymized_response = _deanonymize_text(anonymized_response, inverse_uuid_map)

    return anonymized_response, deanonymized_response
# Interface layout and event wiring. The components created here are the module-level
# names that the callback functions reference when they return updates.
# NOTE(review): the "β" / "π» βοΈ" sequences in the strings below look like mis-decoded
# characters (probably dashes/emoji) - confirm against the repository copy.
demo = gr.Blocks(css=".markdown-body { font-size: 18px; }")

with demo:

    gr.Markdown(
        """
<p align="center">
<img width=200 src="file/images/logos/zama.jpg">
</p>
<h1 style="text-align: center;">Encrypted Anonymization Using Fully Homomorphic Encryption</h1>
<p align="center">
<a href="https://github.com/zama-ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/github.png">Concrete-ML</a>
β
<a href="https://docs.zama.ai/concrete-ml"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/documentation.png">Documentation</a>
β
<a href=" https://community.zama.ai/c/concrete-ml/8"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/community.png">Community</a>
β
<a href="https://twitter.com/zama_fhe"> <img style="vertical-align: middle; display:inline-block; margin-right: 3px;" width=15 src="file/images/logos/x.png">@zama_fhe</a>
</p>
"""
    )

    # gr.Markdown(
    # """
    # <p align="center">
    # <img width="15%" height="15%" src="./encrypted_anonymization_diagram.jpg">
    # </p>
    # """
    # )

    with gr.Accordion("What is encrypted anonymization?", open=False):
        # Merge conflict resolved: both sides carried the same text and differed only by
        # a leading newline inside the string.
        gr.Markdown(
            """
Anonymization is the process of removing personally identifiable information (PII)
from data to protect individual privacy.
To resolve trust issues when deploying anonymization as a cloud service, Fully Homomorphic
Encryption (FHE) can be used to preserve the privacy of the original data using
encryption.
The data remains encrypted throughout the anonymization process, eliminating the need for
third-party access to the raw data. Once the data is anonymized, it can safely be sent
to GenAI services such as ChatGPT.
"""
        )

    ########################## Key Gen Part ##########################

    gr.Markdown(
        "### Key generation\n\n"
        """In FHE schemes, two sets of keys are generated. First, secret keys are used for
encrypting and decrypting data owned by the client. Second, evaluation keys allow a server
to blindly process the encrypted data. """
    )

    gen_key_btn = gr.Button("Generate the private and evaluation keys")

    gen_key_btn.click(
        key_gen_fn,
        inputs=[],
        outputs=[gen_key_btn],
    )

    ########################## Main document Part ##########################

    gr.Markdown("## Private document")

    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """This document was retrieved from the [Microsoft Presidio](https://huggingface.co/spaces/presidio/presidio_demo) demo.\n\n
You can select and deselect sentences to customize the document that will be used
as the initial prompt for ChatGPT in this space's final stage.\n\n
"""
            )
        with gr.Column():
            gr.Markdown(
                """You can see the anonymized document that is sent to ChatGPT here.
ChatGPT will answer any queries that you have about the document below.
The anonymized information is replaced with hexadecimal strings.
"""
            )

    with gr.Row():
        with gr.Column():
            original_sentences_box = gr.CheckboxGroup(
                ORIGINAL_DOCUMENT, value=ORIGINAL_DOCUMENT, label="Original document:"
            )

        with gr.Column():
            anonymized_doc_box = gr.Textbox(
                label="Anonymized document:", value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
            )

    original_sentences_box.change(
        fn=select_static_sentences_fn,
        inputs=[original_sentences_box],
        outputs=[anonymized_doc_box],
    )

    ########################## User Query Part ##########################

    gr.Markdown("<hr />")
    gr.Markdown("## Private query")

    gr.Markdown(
        """Now, formulate a query regarding the selected document.\n\n
Choose from predefined options in 'Example Queries' or craft a custom query
in the 'User Query' box. Keep your question concise and relevant to the text's
context. Any off-topic question will not be processed.
"""
    )

    with gr.Row():
        with gr.Column(scale=5):
            with gr.Column(scale=5):
                default_query_box = gr.Dropdown(
                    list(DEFAULT_QUERIES.values()), label="Example queries"
                )

                query_box = gr.Textbox(
                    value="Who lives in Maine?", label="User query", interactive=True
                )

                # Selecting an example query copies it into the editable query box.
                default_query_box.change(
                    fn=lambda default_query_box: default_query_box,
                    inputs=[default_query_box],
                    outputs=[query_box],
                )

        with gr.Column(scale=1, min_width=6):
            gr.HTML("<div style='height: 25px;'></div>")
            gr.Markdown(
                """
<p align="center">
Encrypt data locally with FHE π» βοΈ
</p>
"""
            )
            encrypt_btn = gr.Button("Encrypt data")
            gr.HTML("<div style='height: 25px;'></div>")

        with gr.Column(scale=5):
            output_encrypted_box = gr.Textbox(
                label="Encrypted anonymized query that is sent to the anonymization server", lines=6
            )

    # Both boxes are registered as outputs because the callback updates either one.
    encrypt_btn.click(
        fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box]
    )

    gr.Markdown("<hr />")
    gr.Markdown("## Secure anonymization with FHE")

    gr.Markdown(
        """
Once the client encrypts the private query locally,
the client transmits it to a remote server to perform the
anonymization on encrypted data. When the computation is finished, the server returns
the result to the client for decryption.
"""
    )

    run_fhe_btn = gr.Button("Anonymize with FHE")

    anonymized_text_output = gr.Textbox(
        label="Decrypted anonymized query that will be sent to ChatGPT", lines=1, interactive=True
    )

    identified_words_output = gr.Dataframe(label="Identified words", visible=False)

    run_fhe_btn.click(
        run_fhe_fn,
        inputs=[query_box],
        outputs=[anonymized_text_output, identified_words_output],
    )

    gr.Markdown("<hr />")
    gr.Markdown("## Secure your communication on ChatGPT with anonymized queries")

    gr.Markdown(
        """After securely anonymizing the query with FHE,
you can forward it to ChatGPT without any concern for information leakage."""
    )

    chatgpt_button = gr.Button("Query ChatGPT")

    with gr.Row():
        chatgpt_response_anonymized = gr.Textbox(label="ChatGPT anonymized response", lines=13)
        chatgpt_response_deanonymized = gr.Textbox(
            label="ChatGPT non-anonymized response", lines=13
        )

    chatgpt_button.click(
        query_chatgpt_fn,
        inputs=[anonymized_text_output, anonymized_doc_box],
        outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
    )

    gr.Markdown(
        """**Please Note**: As this space is intended solely for demonstration purposes, some
private information may be missed by the anonymization algorithm. Please validate the
following query before sending it to ChatGPT."""
    )

# Launch the app
demo.launch(share=False)