Spaces:
Runtime error
Runtime error
import os
import re
from datetime import datetime
from io import BytesIO

import gradio as gr
import pandas as pd
import torch
from azure.storage.blob import BlobServiceClient
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
# Azure Storage Account details
STORAGE_ACCOUNT_NAME = "piointernaldestrg"
# SECURITY: the account key is a secret and must never live in source control.
# It is read from the AZURE_STORAGE_ACCOUNT_KEY environment variable instead.
# A key was previously committed here in plain text; treat that key as
# compromised and rotate it in the Azure portal.
# When the variable is unset this is None, so authenticated uploads are
# expected to fail until the secret is configured.
STORAGE_ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
CONTAINER_NAME = "invoices"
# Initialize model and processor
# Both objects are loaded from the same checkpoint, so the id is named once.
_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct-AWQ"

processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(_MODEL_ID, torch_dtype="auto")

# Move the weights to the GPU only when one is actually present.
if torch.cuda.is_available():
    model.to("cuda")
# Function to process a batch of images
_INVOICE_COLUMNS = (
    "invoice_number",
    "date",
    "place_of_invoice",
    "total_amount",
    "category_of_invoice",
)

# The prompt is identical for every image, so it is built once.
_INVOICE_PROMPT = (
    "Please extract the following details from the invoice:\n"
    "- 'invoice_number'\n"
    "- 'date'\n"
    "- 'place of invoice (city)'\n"
    "- 'total amount'\n"
    "- 'category of invoice (like food, stay, travel, other)'"
)

# Decimal amount with two fraction digits, optionally with thousands commas.
_AMOUNT_RE = re.compile(r"\d[\d,]*\.\d{2}")

_TOTAL_KEYWORDS = ("total", "final amount", "balance due")


def _generate_invoice_text(model, processor, image_path):
    """Run the vision-language model on one image and return its decoded answer."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": _INVOICE_PROMPT},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    answer_ids = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        answer_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]


def _extract_invoice_fields(raw_text):
    """Map the model's line-oriented answer onto the structured invoice fields."""
    fields = dict.fromkeys(_INVOICE_COLUMNS)
    for line in raw_text.split("\n"):
        # Split on the FIRST colon so values that themselves contain colons
        # (e.g. "2024-01-05 10:30:00") are kept whole. A line without any
        # colon is treated as being both label and value.
        label, sep, value = line.partition(":")
        if not sep:
            value = line
        value = value.strip()
        # Keywords are matched against the label only, so a keyword that
        # merely appears inside a value cannot hijack another field.
        # Underscores are normalised so "place_of_invoice" matches too.
        key = label.lower().replace("_", " ")

        if "number" in key:
            fields["invoice_number"] = value
        elif "date" in key:
            fields["date"] = process_date(value)
        elif "place of invoice" in key:
            fields["place_of_invoice"] = value
        elif any(keyword in key for keyword in _TOTAL_KEYWORDS):
            amounts = _AMOUNT_RE.findall(value)
            if amounts:
                # The last amount on the line wins; thousands separators
                # are removed so "1,234.56" is stored as "1234.56".
                fields["total_amount"] = amounts[-1].replace(",", "")
        elif "category of invoice" in key:
            fields["category_of_invoice"] = value
    return fields


def process_image_batch(model, processor, image_paths):
    """Extract invoice fields from each image and return them as a DataFrame.

    Args:
        model: Generation model; must expose ``device`` and ``generate``.
        processor: Matching processor; must expose ``apply_chat_template``,
            ``batch_decode`` and be callable to build model inputs.
        image_paths: Iterable of image paths. ``None`` or an empty iterable
            yields an empty frame.

    Returns:
        A ``pandas.DataFrame`` with one row per image and the columns
        ``invoice_number``, ``date``, ``place_of_invoice``, ``total_amount``
        and ``category_of_invoice``. If processing an image fails, its row
        holds ``"Error"`` in the first four columns and the exception message
        in ``category_of_invoice``; the remaining images are still processed.
    """
    results = []
    for image_path in image_paths or []:
        try:
            answer = _generate_invoice_text(model, processor, image_path)
            results.append(_extract_invoice_fields(answer))
        except Exception as e:  # best-effort: one bad image must not abort the batch
            results.append({
                "invoice_number": "Error",
                "date": "Error",
                "place_of_invoice": "Error",
                "total_amount": "Error",
                "category_of_invoice": str(e),
            })
    # Passing the columns keeps the schema stable even when there are no rows.
    return pd.DataFrame(results, columns=list(_INVOICE_COLUMNS))
# Function to process and format dates
def process_date(date_str):
    """Normalise a date string to ``DD/MM/YYYY`` where it can be parsed.

    Recognised inputs:
      * ``DD/MM/YYYY`` (as a prefix) - returned unchanged.
      * day, month name, year, e.g. ``"05 Jan 2024"`` or ``"5 January 2024"``
        - returned as ``DD/MM/YYYY``.
      * day and month name only, e.g. ``"29 Feb"`` - returned as
        ``DD/MM/YYYY`` with the literal placeholder ``YYYY`` as the year.

    Anything else, including non-string input, is returned unchanged.
    Month names are parsed with ``%b``/``%B`` and therefore follow the
    process locale.
    """
    if not isinstance(date_str, str):
        return date_str

    text = date_str.strip()
    if re.match(r"\d{2}/\d{2}/\d{4}", text):
        return date_str

    # Try the abbreviated month name first, then the full name.
    for fmt in ("%d %b %Y", "%d %B %Y"):
        try:
            return datetime.strptime(text, fmt).strftime("%d/%m/%Y")
        except ValueError:
            continue

    # Day and month without a year: parse against a fixed leap year so that
    # "29 Feb" is accepted (strptime would otherwise default to 1900, which
    # is not a leap year).
    for fmt in ("%d %b", "%d %B"):
        try:
            parsed = datetime.strptime(f"{text} 2000", f"{fmt} %Y")
        except ValueError:
            continue
        return parsed.strftime("%d/%m") + "/YYYY"

    return date_str
# Upload extracted data to Azure Blob Storage as a Parquet file
def upload_to_azure_blob(df):
    """Serialise ``df`` to Parquet and upload it to the invoices container.

    Args:
        df: DataFrame to upload; it is written without its index.

    Returns:
        The URL of the uploaded blob as a string on success, or a dict
        ``{"error": <message>}`` if serialisation or the upload fails.
        No exception is propagated to the caller.
    """
    try:
        # Convert DataFrame to Parquet format (in memory)
        parquet_buffer = BytesIO()
        df.to_parquet(parquet_buffer, index=False)

        # Create the BlobServiceClient object
        account_url = f"https://{STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
        blob_service_client = BlobServiceClient(
            account_url=account_url,
            credential=STORAGE_ACCOUNT_KEY,
        )

        # The blob name carries microseconds: with overwrite=True, a
        # second-resolution name would let two uploads within the same
        # second silently replace each other.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        blob_name = f"invoice_data_{timestamp}.parquet"
        blob_client = blob_service_client.get_blob_client(container=CONTAINER_NAME, blob=blob_name)

        # Upload the Parquet file
        blob_client.upload_blob(parquet_buffer.getvalue(), overwrite=True)

        # Return the file URL, built from the same name that was uploaded
        return f"{account_url}/{CONTAINER_NAME}/{blob_name}"
    except Exception as e:  # boundary: report the failure instead of crashing the UI
        return {"error": str(e)}
# Gradio interface function
def gradio_interface(username, email, image_files):
    """Handle one form submission: extract invoice data and upload it.

    Args:
        username: Value of the "Username" textbox.
        email: Value of the "Email" textbox.
        image_files: Uploaded invoice image paths; may be ``None`` or empty
            when the form is submitted without files.

    Returns:
        A 3-tuple ``(user_info, dataframe, status)``: the echoed user
        details, the extracted invoice rows, and either the uploaded
        Parquet file URL or a message explaining why nothing was uploaded.
    """
    user_info = f"Username: {username}\nEmail: {email}"

    # Nothing to process: skip the model and the upload entirely instead of
    # failing on a missing file list or storing an empty file.
    if not image_files:
        return user_info, pd.DataFrame(), "No invoice images were uploaded; nothing was saved."

    df = process_image_batch(model, processor, image_files)
    upload_result = upload_to_azure_blob(df)

    # upload_to_azure_blob signals failure with a dict; show it as an error
    # rather than presenting the dict as if it were a file URL.
    if isinstance(upload_result, dict):
        return user_info, df, f"Upload failed: {upload_result.get('error', 'unknown error')}"
    return user_info, df, f"Parquet File URL: {upload_result}"
# Define the Gradio interface
# Three inputs (username, email, invoice files) are passed to
# gradio_interface, whose three return values fill the three outputs in order.
grpc_interface = gr.Interface(
    title="Invoice Extraction System",
    description="Upload invoices, extract details, and save to Azure Blob Storage.",
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Username"),
        gr.Textbox(label="Email"),
        gr.Files(label="Upload Invoice Images", type="filepath"),
    ],
    outputs=[
        gr.Textbox(label="User Info"),
        gr.Dataframe(label="Extracted Invoice Data"),
        gr.Textbox(label="Parquet File URL"),
    ],
)

# Launch the Gradio interface
# Only when executed as a script; importing this module does not start the app.
if __name__ == "__main__":
    grpc_interface.launch(share=True)