Spaces:

Illia56
/

PDF_OCR_OPENAI

Running

App Files Files Community

Illia56 committed on May 1

Commit

ce622d5

verified ·

1 Parent(s): 72c2d3d

Upload 2 files

Browse files

Files changed (2) hide show

openai_app.py +122 -0
requirements.txt +2 -0

openai_app.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import gradio as gr
+from pdf2image import convert_from_path
+import base64
+from openai import OpenAI
+from io import BytesIO
+import concurrent.futures
+import json
+import os
+import zipfile
+import tempfile
+import shutil
+import os
+if not os.environ.get("OPENAI_API_KEY"):
+    raise ValueError("OPENAI_API_KEY is not set")
+client = OpenAI()
+def encode_pil_image(pil_image):
+    buffered = BytesIO()
+    pil_image.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+def extract_markdown_from_image(image, idx):
+    base64_image = encode_pil_image(image)
+    try:
+        completion = client.chat.completions.create(
+            model="o4-mini",
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        { "type": "text", "text": "Extract the text from this page and return it as markdown, with the best possible quality and accuracy." },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": "high"
+                            },
+                        },
+                    ],
+                }
+            ],
+        )
+        return idx, completion.choices[0].message.content
+    except Exception as e:
+        print(e)
+        return idx, f"Error processing page {idx}: {e}"
+def pdf_to_json_and_md_zip_with_progress(pdf_file, progress=gr.Progress(track_tqdm=True)):
+    # Save uploaded file to a temp path if needed
+    if hasattr(pdf_file, "name"):
+        pdf_path = pdf_file.name
+    else:
+        # Gradio may pass a str path or a file object
+        pdf_path = pdf_file
+    images = convert_from_path(pdf_path)
+    num_pages = len(images)
+    results = [None] * num_pages
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(num_pages):
+            futures.append(executor.submit(extract_markdown_from_image, images[i], i))
+        for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+            idx_result, content = future.result()
+            results[idx_result] = content.replace("```markdown", "").replace("```", "")
+            progress((idx + 1) / num_pages, desc=f"Processing page {idx_result + 1} of {num_pages}")
+    output_json = [
+        {"page": idx + 1, "markdown": content}
+        for idx, content in enumerate(results)
+    ]
+    # Create a temporary directory to store md files and json
+    temp_dir = tempfile.mkdtemp()
+    md_folder = os.path.join(temp_dir, "pages")
+    os.makedirs(md_folder, exist_ok=True)
+    # Write each page as a separate .md file
+    for idx, content in enumerate(results):
+        md_path = os.path.join(md_folder, f"page_{idx+1}.md")
+        with open(md_path, "w", encoding="utf-8") as f:
+            f.write(content.strip())
+    # Write the JSON file
+    output_json_path = os.path.join(temp_dir, "ocr_output.json")
+    with open(output_json_path, "w", encoding="utf-8") as f:
+        json.dump(output_json, f, ensure_ascii=False, indent=2)
+    # Create a zip file containing the folder with md files and the json
+    zip_path = os.path.join(temp_dir, "ocr_output.zip")
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
+        # Add the JSON file
+        zipf.write(output_json_path, arcname="ocr_output.json")
+        # Add the md files folder and its contents
+        for root, dirs, files in os.walk(md_folder):
+            for file in files:
+                file_path = os.path.join(root, file)
+                arcname = os.path.relpath(file_path, temp_dir)
+                zipf.write(file_path, arcname=arcname)
+    return zip_path
+with gr.Blocks() as demo:
+    gr.Markdown("# PDF to Markdown & JSON OCR (OpenAI Vision)\nUpload a PDF file. Each page will be processed and the extracted markdown will be saved as separate .md files in a folder, and all results will be zipped together with a JSON file.")
+    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+    zip_output = gr.File(label="Download ZIP (md files + JSON)", interactive=False)
+    def process_and_return_zip(pdf_file, progress=gr.Progress(track_tqdm=True)):
+        zip_path = pdf_to_json_and_md_zip_with_progress(pdf_file, progress=progress)
+        return zip_path
+    process_btn = gr.Button("Convert PDF to ZIP")
+    process_btn.click(
+        process_and_return_zip,
+        inputs=[pdf_input],
+        outputs=[zip_output]
+    )
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ pdf2image
2	+ openai