|
|
""" |
|
|
PLOBIN |
|
|
""" |
|
|
import difflib |
|
|
import streamlit as st |
|
|
import streamlit.components.v1 as components |
|
|
import fitz |
|
|
import chromadb |
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
import requests |
|
|
import os |
|
|
import re |
|
|
import shutil |
|
|
from collections import Counter |
|
|
import numpy as np |
|
|
from typing import List, Dict, Tuple |
|
|
import base64 |
|
|
from dotenv import load_dotenv |
|
|
import json |
|
|
from difflib import SequenceMatcher |
|
|
import pdfplumber |
|
|
|
|
|
def get_svg_content(svg_path): |
|
|
with open(svg_path, "r", encoding="utf-8") as f: |
|
|
return f.read() |
|
|
|
|
|
plobin_logo_svg = get_svg_content("img/plobin.svg") |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
GROK_API_KEY = os.getenv("GROK_API_KEY") |
|
|
GROK_API_BASE = "https://api.x.ai/v1" |
|
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") |
|
|
OPENAI_API_BASE = "https://api.openai.com/v1" |
|
|
CHROMA_DIR = "./chroma_db" |
|
|
EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask' |
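# Korean sentence-embedding model (SBERT); used below to embed both document chunks and user queries.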
|
|
|
|
|
class HighlightConfig: |
|
|
def __init__(self): |
|
|
self.color = [1.0, 1.0, 0.0] |
|
|
|
|
|
st.set_page_config( |
|
|
page_title="PLOBIN", |
|
|
page_icon="img/plobin-left-only.png", |
|
|
layout="wide", |
|
|
initial_sidebar_state="expanded" |
|
|
) |
|
|
|
|
|
st.markdown(""" |
|
|
<style> |
|
|
[data-testid="stSidebar"] { |
|
|
background: linear-gradient(180deg, |
|
|
#f9f9f9 0%, |
|
|
#f9f9f9 100%); |
|
|
box-shadow: none; |
|
|
border-right: 1px solid #ededed; |
|
|
width: 280px !important; |
|
|
} |
|
|
|
|
|
|
|
|
[data-testid="stSidebar"] h1 { |
|
|
color: white !important; |
|
|
font-weight: 900 !important; |
|
|
text-shadow: |
|
|
0 0 30px rgba(255,255,255,0.6), |
|
|
0 0 50px rgba(102,126,234,0.4), |
|
|
3px 3px 40px rgba(0,0,0,0.4); |
|
|
animation: sidebarTitlePulse 4s ease-in-out infinite; |
|
|
letter-spacing: 2px; |
|
|
} |
|
|
|
|
|
@keyframes sidebarTitlePulse { |
|
|
0%, 100% { |
|
|
transform: scale(1); |
|
|
text-shadow: |
|
|
0 0 30px rgba(255,255,255,0.6), |
|
|
0 0 50px rgba(102,126,234,0.4), |
|
|
3px 3px 40px rgba(0,0,0,0.4); |
|
|
} |
|
|
50% { |
|
|
transform: scale(1.03); |
|
|
text-shadow: |
|
|
0 0 40px rgba(255,255,255,0.8), |
|
|
0 0 70px rgba(102,126,234,0.6), |
|
|
0 0 100px rgba(118,75,162,0.4), |
|
|
3px 3px 40px rgba(0,0,0,0.4); |
|
|
} |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] { |
|
|
background: rgba(198,198,198,0.15); |
|
|
border-radius: 15px; |
|
|
padding: 1.5rem; |
|
|
border: 1.5px dashed rgba(198,198,198,0.4); |
|
|
transition: all 0.3s ease; |
|
|
backdrop-filter: blur(10px); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[data-testid="stFileUploader"] > section { |
|
|
background: transparent !important; |
|
|
} |
|
|
|
|
|
[data-testid="stFileUploader"] > section > div { |
|
|
background: transparent !important; |
|
|
} |
|
|
|
|
|
[data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] { |
|
|
color: #c6c6c6; |
|
|
} |
|
|
|
|
|
/* Replace the user chat-message avatar icon */
|
|
[data-testid="stChatMessage"][data-testid="user"] |
|
|
[data-testid="chat-message-avatar"] img { |
|
|
content: url("https://your-image-url.com/user-icon.png") !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] > section, |
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] section > div { |
|
|
background: transparent !important; |
|
|
border: none !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p { |
|
|
color: #555555 !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] { |
|
|
background: rgba(127,128,134,0.2) !important; |
|
|
color: #8A8A8A !important; |
|
|
border: 1px solid rgba(127,128,134,0.3) !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] .stButton button { |
|
|
background: rgba(127,128,134,0.15) !important; |
|
|
color: #555555 !important; |
|
|
border: 2px solid rgba(127,128,134,0.4) !important; |
|
|
border-radius: 12px !important; |
|
|
font-weight: 700 !important; |
|
|
padding: 0.75rem 1.5rem !important; |
|
|
backdrop-filter: blur(10px) !important; |
|
|
transition: all 0.3s ease !important; |
|
|
box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] .stButton button:hover { |
|
|
background: rgba(255, 36, 36,0.25) !important; |
|
|
border-color: rgba(255, 36, 36,0.6) !important; |
|
|
transform: translateY(-2px) scale(1.02) !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] .stButton button:active { |
|
|
transform: translateY(0px) scale(0.98) !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] .stButton button[kind="primary"] { |
|
|
background: rgba(255,255,255,0.25) !important; |
|
|
border: 2px solid rgba(255,255,255,0.5) !important; |
|
|
font-size: 1.05rem !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] .stButton button[kind="primary"]:hover { |
|
|
background: rgba(255,255,255,0.35) !important; |
|
|
border-color: rgba(255,255,255,0.7) !important; |
|
|
} |
|
|
|
|
|
[data-testid="stSidebar"] [data-testid="stAlert"] { |
|
|
background-color: #f2f2f2 !important; |
|
|
border-radius: 0.5rem !important; |
|
|
} |
|
|
|
|
|
|
|
|
} |
|
|
[data-testid="stSidebar"] [data-testid="stFileUploader"] button { |
|
|
display: block; |
|
|
} |
|
|
|
|
|
/* Keep the sidebar collapse/expand button always visible */
|
|
[data-testid="stSidebarCollapseButton"] { |
|
|
opacity: 1 !important; |
|
|
visibility: visible !important; |
|
|
transition: opacity 0.2s ease !important; |
|
|
} |
|
|
|
|
|
/* Remove the hover fade and keep the button fully visible */
|
|
[data-testid="stSidebarCollapseButton"]:hover { |
|
|
opacity: 1 !important; |
|
|
} |
|
|
|
|
|
[data-testid="stAlert"] p { |
|
|
color: #747474; |
|
|
} |
|
|
|
|
|
/* Override the style of the whole sidebar alert box */
|
|
[data-testid="stSidebar"] [data-testid="stAlert"] { |
|
|
background-color: #f2f2f2 !important; /* desired background color */
|
|
border-radius: 0.5rem !important; |
|
|
} |
|
|
|
|
|
/* Force the color onto the alert's inner container as well */
|
|
[data-testid="stSidebar"] [data-testid="stAlert"] > div { |
|
|
background-color: #f2f2f2 !important; |
|
|
} |
|
|
|
|
|
/* Innermost alert message box */
|
|
[data-testid="stSidebar"] [data-testid="stAlert"] [role="alert"] { |
|
|
background-color: #f2f2f2 !important; |
|
|
} |
|
|
|
|
|
|
|
|
.main .block-container { |
|
|
max-width: 100%; |
|
|
padding-left: 2rem; |
|
|
padding-right: 2rem; |
|
|
} |
|
|
|
|
|
.plobin-header { |
|
|
padding: 1.5rem 2rem; |
|
|
margin-bottom: 2rem; |
|
|
} |
|
|
|
|
|
.plobin-logo { |
|
|
display: block; |
|
|
margin: 0 auto; |
|
|
height: 60px; |
|
|
} |
|
|
|
|
|
.plobin-title { |
|
|
font-size: 2.5rem; |
|
|
font-weight: bold; |
|
|
color: white; |
|
|
margin: 0; |
|
|
text-align: center; |
|
|
text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.4), |
|
|
0 0 20px rgba(102, 126, 234, 0.4); |
|
|
} |
|
|
|
|
|
.plobin-subtitle { |
|
|
font-size: 1rem; |
|
|
color: rgba(255, 255, 255, 0.9); |
|
|
text-align: center; |
|
|
margin-top: 0.5rem; |
|
|
text-shadow: 1px 1px 6px rgba(0, 0, 0, 0.4); |
|
|
} |
|
|
|
|
|
[data-testid="stFileUploader"] { |
|
|
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); |
|
|
border: 3px dashed #667eea; |
|
|
border-radius: 1rem; |
|
|
padding: 3rem 2rem; |
|
|
} |
|
|
|
|
|
[data-testid="stFileUploader"] > div { |
|
|
text-align: center; |
|
|
} |
|
|
|
|
|
[data-testid="stFileUploader"] label { |
|
|
font-size: 1.2rem !important; |
|
|
color: #2D3748 !important; |
|
|
font-weight: 600 !important; |
|
|
} |
|
|
|
|
|
.pdf-container { |
|
|
border: 2px solid #E2E8F0; |
|
|
border-radius: 0.5rem; |
|
|
padding: 0.5rem; |
|
|
height: 706px; |
|
|
overflow-y: auto; |
|
|
background: white; |
|
|
} |
|
|
|
|
|
.chat-container { |
|
|
border: 2px solid #E2E8F0; |
|
|
border-radius: 0.5rem; |
|
|
padding: 1rem; |
|
|
height: 650px; |
|
|
overflow-y: auto; |
|
|
background: white; |
|
|
margin-bottom: 0.5rem; |
|
|
} |
|
|
|
|
|
[data-testid="stChatInput"] { |
|
|
margin-top: 0 !important; |
|
|
padding-top: 0 !important; |
|
|
} |
|
|
|
|
|
.source-box { |
|
|
background: #F1F5F9; |
|
|
padding: 1rem; |
|
|
border-radius: 0.5rem; |
|
|
margin: 0.5rem 0; |
|
|
border-left: 3px solid #667eea; |
|
|
} |
|
|
|
|
|
.source-title { |
|
|
font-weight: bold; |
|
|
color: #667eea; |
|
|
margin-bottom: 0.5rem; |
|
|
} |
|
|
|
|
|
.page-indicator { |
|
|
background: #667eea; |
|
|
color: white; |
|
|
padding: 0.3rem 0.8rem; |
|
|
border-radius: 1rem; |
|
|
font-size: 0.85rem; |
|
|
display: inline-block; |
|
|
margin: 0.2rem; |
|
|
} |
|
|
|
|
|
.highlight-indicator { |
|
|
background: #FEF08A; |
|
|
color: #854D0E; |
|
|
padding: 0.5rem 1rem; |
|
|
border-radius: 0.5rem; |
|
|
margin: 0.5rem 0; |
|
|
font-weight: bold; |
|
|
border-left: 4px solid #EAB308; |
|
|
} |
|
|
|
|
|
.usage-guide { |
|
|
background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); |
|
|
padding: 2rem; |
|
|
border-radius: 1rem; |
|
|
margin-bottom: 2rem; |
|
|
height: 100%; |
|
|
} |
|
|
|
|
|
.guide-step { |
|
|
display: flex; |
|
|
align-items: center; |
|
|
margin: 1.5rem 0; |
|
|
margin-left: 3.5rem; |
|
|
font-size: 1.1rem; |
|
|
color: #2D3748; |
|
|
} |
|
|
|
|
|
.step-number { |
|
|
background: #667eea; |
|
|
color: white; |
|
|
width: 2.5rem; |
|
|
height: 2.5rem; |
|
|
border-radius: 50%; |
|
|
display: flex; |
|
|
align-items: center; |
|
|
justify-content: center; |
|
|
font-weight: bold; |
|
|
font-size: 1.2rem; |
|
|
margin-right: 1rem; |
|
|
flex-shrink: 0; |
|
|
} |
|
|
|
|
|
.viewer-header { |
|
|
display: flex; |
|
|
justify-content: space-between; |
|
|
align-items: center; |
|
|
margin-bottom: 1rem; |
|
|
} |
|
|
|
|
|
@keyframes pulse { |
|
|
0%, 100% { |
|
|
box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7); |
|
|
} |
|
|
50% { |
|
|
box-shadow: 0 0 20px 10px rgba(16, 185, 129, 0); |
|
|
} |
|
|
} |
|
|
|
|
|
.chat-title { |
|
|
color: black !important; |
|
|
font-weight: 900 !important; |
|
|
font-size: 1.75rem !important; |
|
|
margin-bottom: 1rem !important; |
|
|
text-shadow: |
|
|
0 0 30px rgba(255,255,255,0.6), |
|
|
0 0 50px rgba(102,126,234,0.4), |
|
|
3px 3px 40px rgba(0,0,0,0.4); |
|
|
letter-spacing: 2px; |
|
|
} |
|
|
|
|
|
[data-testid="column"] button[kind="secondary"] { |
|
|
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; |
|
|
color: white !important; |
|
|
border: none !important; |
|
|
border-radius: 0.5rem !important; |
|
|
padding: 0.6rem 1rem !important; |
|
|
font-weight: bold !important; |
|
|
font-size: 0.95rem !important; |
|
|
text-align: left !important; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important; |
|
|
transition: all 0.2s ease !important; |
|
|
cursor: pointer !important; |
|
|
} |
|
|
|
|
|
[data-testid="column"] button[kind="secondary"]:hover { |
|
|
transform: translateY(-2px) !important; |
|
|
box-shadow: 0 4px 8px rgba(102, 126, 234, 0.3) !important; |
|
|
background: linear-gradient(135deg, #7c8ff5 0%, #8a5db8 100%) !important; |
|
|
} |
|
|
|
|
|
[data-testid="column"] button[kind="primary"] { |
|
|
background: linear-gradient(135deg, #FEF08A 0%, #FDE047 100%) !important; |
|
|
color: #854D0E !important; |
|
|
border: 2px solid #EAB308 !important; |
|
|
border-radius: 0.5rem !important; |
|
|
padding: 0.6rem 1rem !important; |
|
|
font-weight: bold !important; |
|
|
font-size: 0.95rem !important; |
|
|
text-align: left !important; |
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important; |
|
|
transition: all 0.2s ease !important; |
|
|
cursor: pointer !important; |
|
|
} |
|
|
|
|
|
[data-testid="column"] button[kind="primary"]:hover { |
|
|
transform: translateY(-2px) !important; |
|
|
box-shadow: 0 4px 8px rgba(234, 179, 8, 0.3) !important; |
|
|
background: linear-gradient(135deg, #FDE047 0%, #FACC15 100%) !important; |
|
|
} |
|
|
|
|
|
/* Chat input: focus border color */
|
|
[data-testid="stChatInput"] textarea:focus { |
|
|
border-color: #3f3f3f !important; |
|
|
box-shadow: 0 0 0 1px #3f3f3f !important; |
|
|
} |
|
|
|
|
|
/* Chat input: default state */
[data-testid="stChatInput"] textarea {
border-color: #3f3f3f !important;
|
|
transition: border-color 0.2s ease; |
|
|
} |
|
|
|
|
|
/* Chat input: hover state */
[data-testid="stChatInput"] textarea:hover {
border-color: #3f3f3f !important;
|
|
} |
|
|
|
|
|
/* Hide the default Streamlit avatar */
|
|
[data-testid="stChatMessage"][data-testid="user"] |
|
|
[data-testid="chat-message-avatar"] img { |
|
|
display: none !important; |
|
|
} |
|
|
|
|
|
/* Replace it with a custom user icon */
[data-testid="stChatMessage"][data-testid="user"]
[data-testid="chat-message-avatar"] {
background-image: url("final/img/user-profile.png");
|
|
background-size: cover; |
|
|
background-position: center; |
|
|
width: 36px !important; |
|
|
height: 36px !important; |
|
|
border-radius: 50%; /* circular avatar */
|
|
} |
|
|
|
|
|
/* Remove the default avatar image */
|
|
[data-testid="stChatMessage"][data-testid="assistant"] |
|
|
[data-testid="chat-message-avatar"] img { |
|
|
display: none !important; |
|
|
} |
|
|
|
|
|
/* Use a custom assistant icon */
[data-testid="stChatMessage"][data-testid="assistant"]
[data-testid="chat-message-avatar"] {
background-image: url("final/img/cloud.png");
|
|
background-size: cover; |
|
|
background-position: center; |
|
|
width: 36px !important; |
|
|
height: 36px !important; |
|
|
border-radius: 50%; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
</style> |
|
|
""", unsafe_allow_html=True) |
|
|
|
|
|
SPACE_RE = re.compile(r'\s+') |
|
|
|
|
|
def normalize_for_search(text: str) -> str: |
|
|
""" |
|
|
๊ฒ์/๋งค์นญ์ฉ ํ
์คํธ ์ ๊ทํ: |
|
|
- ์๋ ๊ณต๋ฐฑ ์ ๊ฑฐ |
|
|
- ์๋ฌธ์ ๋ณํ |
|
|
- ๋ชจ๋ ๊ณต๋ฐฑ ๋ฌธ์ ์ ๊ฑฐ (๋์ด์ฐ๊ธฐ ์ฐจ์ด ๋ฌด์) |
|
|
""" |
|
|
text = text.strip().lower() |
|
|
text = SPACE_RE.sub('', text) |
|
|
return text |
|
|
|
|
|
|
|
|
def init_session(): |
|
|
if 'processed' not in st.session_state: |
|
|
st.session_state.processed = False |
|
|
if 'vector_db' not in st.session_state: |
|
|
st.session_state.vector_db = None |
|
|
if 'embedder' not in st.session_state: |
|
|
st.session_state.embedder = None |
|
|
if 'chat_history' not in st.session_state: |
|
|
st.session_state.chat_history = [] |
|
|
if 'doc_metadata' not in st.session_state: |
|
|
st.session_state.doc_metadata = {} |
|
|
if 'pdf_bytes' not in st.session_state: |
|
|
st.session_state.pdf_bytes = None |
|
|
if 'pdf_pages_text' not in st.session_state: |
|
|
st.session_state.pdf_pages_text = {} |
|
|
if 'current_highlights' not in st.session_state: |
|
|
st.session_state.current_highlights = [] |
|
|
if 'zoom_level' not in st.session_state: |
|
|
st.session_state.zoom_level = 2.0 |
|
|
if 'highlight_config' not in st.session_state: |
|
|
st.session_state.highlight_config = HighlightConfig() |
|
|
if 'processing_query' not in st.session_state: |
|
|
st.session_state.processing_query = None |
|
|
if 'scroll_to_page' not in st.session_state: |
|
|
st.session_state.scroll_to_page = None |
|
|
|
|
|
|
|
|
def extract_table_image_as_base64(pdf_bytes: bytes, page_num: int, bbox: tuple) -> str: |
|
|
""" |
|
|
PDF ํ์ด์ง์์ ํ ์์ญ์ ์ด๋ฏธ์ง๋ก ์ถ์ถํ์ฌ base64๋ก ์ธ์ฝ๋ฉ |
|
|
|
|
|
Args: |
|
|
pdf_bytes: PDF ๋ฐ์ดํธ ๋ฐ์ดํฐ |
|
|
page_num: ํ์ด์ง ๋ฒํธ (0๋ถํฐ ์์) |
|
|
bbox: (x0, y0, x1, y1) ํ ์์ญ ์ขํ |
|
|
|
|
|
Returns: |
|
|
base64 ์ธ์ฝ๋ฉ๋ ์ด๋ฏธ์ง ๋ฌธ์์ด |
|
|
""" |
|
|
doc = fitz.open(stream=pdf_bytes, filetype="pdf") |
|
|
page = doc[page_num] |
|
|
|
|
|
|
|
|
rect = fitz.Rect(bbox) |
|
|
pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), clip=rect) |
|
|
img_bytes = pix.tobytes("png") |
|
|
|
|
|
doc.close() |
|
|
|
|
|
|
|
|
img_base64 = base64.b64encode(img_bytes).decode('utf-8') |
|
|
return img_base64 |
|
|
|
|
|
|
|
|
def convert_table_to_markdown_with_vision( |
|
|
pdf_bytes: bytes, |
|
|
page_num: int, |
|
|
bbox: tuple, |
|
|
api_key: str |
|
|
) -> str: |
|
|
""" |
|
|
OpenAI Vision API๋ฅผ ์ฌ์ฉํ์ฌ ํ ์ด๋ฏธ์ง๋ฅผ ๋งํฌ๋ค์ด์ผ๋ก ๋ณํ |
|
|
|
|
|
Args: |
|
|
pdf_bytes: PDF ๋ฐ์ดํธ ๋ฐ์ดํฐ |
|
|
page_num: ํ์ด์ง ๋ฒํธ |
|
|
bbox: ํ ์์ญ ์ขํ |
|
|
api_key: OpenAI API ํค |
|
|
|
|
|
Returns: |
|
|
๋งํฌ๋ค์ด ํ์์ ํ |
|
|
""" |
|
|
|
|
|
img_base64 = extract_table_image_as_base64(pdf_bytes, page_num, bbox) |
|
|
|
|
|
|
|
|
prompt = """์ด ์ด๋ฏธ์ง๋ PDF ๋ฌธ์์ ํ์
๋๋ค. |
|
|
ํ์ ๋ด์ฉ์ ์ ํํ๊ฒ ๋งํฌ๋ค์ด ํ ํ์์ผ๋ก ๋ณํํด์ฃผ์ธ์. |
|
|
|
|
|
๊ท์น: |
|
|
1. ์
 ๋ณํฉ์ด ์์ผ๋ฉด ์ ์ ํ ์ฒ๋ฆฌ
|
|
2. ์ค์ฒฉ๋ ํ๊ฐ ์์ผ๋ฉด ํ
์คํธ๋ก ํํ
|
|
3. ๋น ์
์ ๋น ์นธ์ผ๋ก ์ ์ง
|
|
4. ํ ํ์๋ง ๋ฐํ (์ถ๊ฐ ์ค๋ช
 ์์ด)
|
|
|
|
|
๋งํฌ๋ค์ด ํ ํ์: |
|
|
| ์ด1 | ์ด2 | ์ด3 | |
|
|
| --- | --- | --- | |
|
|
| ๋ฐ์ดํฐ1 | ๋ฐ์ดํฐ2 | ๋ฐ์ดํฐ3 |""" |
|
|
|
|
|
try: |
|
|
response = requests.post( |
|
|
f"{OPENAI_API_BASE}/chat/completions", |
|
|
headers={ |
|
|
"Authorization": f"Bearer {api_key}", |
|
|
"Content-Type": "application/json" |
|
|
}, |
|
|
json={ |
|
|
"model": "gpt-4o", |
|
|
"messages": [ |
|
|
{ |
|
|
"role": "user", |
|
|
"content": [ |
|
|
{ |
|
|
"type": "text", |
|
|
"text": prompt |
|
|
}, |
|
|
{ |
|
|
"type": "image_url", |
|
|
"image_url": { |
|
|
"url": f"data:image/png;base64,{img_base64}", |
|
|
"detail": "high" |
|
|
} |
|
|
} |
|
|
] |
|
|
} |
|
|
], |
|
|
"temperature": 0.1, |
|
|
"max_tokens": 2000 |
|
|
}, |
|
|
timeout=120 |
|
|
) |
|
|
|
|
|
if response.status_code == 200: |
|
|
result = response.json() |
|
|
markdown_table = result['choices'][0]['message']['content'] |
|
|
|
|
|
|
|
|
markdown_table = re.sub(r'```markdown\s*|\s*```', '', markdown_table) |
|
|
markdown_table = re.sub(r'```\s*|\s*```', '', markdown_table) |
|
|
|
|
|
return markdown_table.strip() |
|
|
else: |
|
|
|
|
|
error_detail = response.text |
|
|
print(f"OpenAI API ์ค๋ฅ: {response.status_code}") |
|
|
print(f"์์ธ: {error_detail}") |
|
|
return f"[ํ ๋ณํ ์คํจ: {response.status_code} - {error_detail[:200]}]" |
|
|
|
|
|
except Exception as e: |
|
|
return f"[ํ ๋ณํ ์คํจ: {str(e)}]" |
|
|
|
|
|
def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]: |
|
|
""" |
|
|
PDF์์ ํ
์คํธ์ ํ๋ฅผ ์ถ์ถ (ํ๋ Grok Vision API๋ก ์ฒ๋ฆฌ) |
|
|
""" |
|
|
pdf_bytes = pdf_file.read() |
|
|
doc = fitz.open(stream=pdf_bytes, filetype="pdf") |
|
|
|
|
|
chunks = [] |
|
|
metadata_list = [] |
|
|
pages_text = {} |
|
|
|
|
|
CHUNK_SIZE = 800 |
|
|
OVERLAP_SIZE = 150 |
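# Chunking parameters: text chunks are capped at ~800 characters, with a 150-character
# overlap so content near a boundary appears in both neighboring chunks.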
|
|
|
|
|
|
|
|
pdf_file.seek(0) |
|
|
|
|
|
with pdfplumber.open(pdf_file) as pdf_plumber: |
|
|
for page_num in range(len(doc)): |
|
|
|
|
|
fitz_page = doc[page_num] |
|
|
text = fitz_page.get_text("text") |
|
|
|
|
|
|
|
|
tables_markdown = [] |
|
|
if page_num < len(pdf_plumber.pages): |
|
|
plumber_page = pdf_plumber.pages[page_num] |
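# "lines"-based detection only finds tables drawn with ruling lines; borderless tables fall through and are indexed as plain text.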
|
|
|
|
|
|
|
|
table_settings = { |
|
|
"vertical_strategy": "lines", |
|
|
"horizontal_strategy": "lines", |
|
|
"snap_tolerance": 3, |
|
|
"join_tolerance": 3, |
|
|
} |
|
|
|
|
|
tables = plumber_page.find_tables(table_settings=table_settings) |
|
|
|
|
|
|
|
|
for idx, table in enumerate(tables): |
|
|
bbox = table.bbox |
|
|
|
|
|
|
|
|
markdown_table = convert_table_to_markdown_with_vision( |
|
|
pdf_bytes, |
|
|
page_num, |
|
|
bbox, |
|
|
OPENAI_API_KEY |
|
|
) |
|
|
|
|
|
tables_markdown.append(f"\n\n**[ํ {idx + 1}]**\n{markdown_table}\n") |
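# Each detected table is wrapped in a "**[ํ N]**" marker so the chunking step below can keep whole tables together.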
|
|
|
|
|
|
|
|
combined_content = text |
|
|
if tables_markdown: |
|
|
combined_content += "\n\n" + "\n".join(tables_markdown) |
|
|
|
|
|
pages_text[page_num + 1] = combined_content |
|
|
|
|
|
if not combined_content.strip(): |
|
|
continue |
|
|
|
|
|
|
|
|
lines = [line.strip() for line in combined_content.split('\n') if line.strip()] |
|
|
cleaned_text = '\n'.join(lines) |
|
|
|
|
|
|
|
|
if "**[ํ" in cleaned_text: |
|
|
|
|
|
table_pattern = r'\*\*\[ํ \d+\]\*\*' |
|
|
parts = re.split(f'({table_pattern})', cleaned_text) |
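# Split the page around table markers: tables become standalone chunks, surrounding text is re-chunked by size.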
|
|
|
|
|
current_chunk = "" |
|
|
for part in parts: |
|
|
part = part.strip() |
|
|
if not part: |
|
|
continue |
|
|
|
|
|
|
|
|
if re.match(table_pattern, part): |
|
|
if current_chunk: |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": "text" |
|
|
}) |
|
|
current_chunk = "" |
|
|
current_chunk = part |
|
|
else: |
|
|
|
|
|
if current_chunk and re.match(table_pattern, current_chunk): |
|
|
|
|
|
current_chunk += "\n" + part |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": "table" |
|
|
}) |
|
|
current_chunk = "" |
|
|
else: |
|
|
|
|
|
if len(current_chunk) + len(part) > CHUNK_SIZE: |
|
|
if current_chunk: |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": "text" |
|
|
}) |
|
|
current_chunk = part |
|
|
else: |
|
|
current_chunk += "\n" + part if current_chunk else part |
|
|
|
|
|
if current_chunk: |
|
|
chunk_type = "table" if re.match(table_pattern, current_chunk) else "text" |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": chunk_type |
|
|
}) |
|
|
else: |
|
|
|
|
|
sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text) |
|
|
sentences = [s for s in sentences if s.strip()] |
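# Pages without tables are split on sentence boundaries and packed into ~CHUNK_SIZE chunks with OVERLAP_SIZE carried over between them.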
|
|
|
|
|
current_chunk = "" |
|
|
current_length = 0 |
|
|
|
|
|
for sentence in sentences: |
|
|
sentence_length = len(sentence) |
|
|
|
|
|
if current_length + sentence_length > CHUNK_SIZE and current_chunk: |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": "text" |
|
|
}) |
|
|
|
|
|
overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk |
|
|
current_chunk = overlap_text + sentence |
|
|
current_length = len(current_chunk) |
|
|
else: |
|
|
current_chunk += sentence |
|
|
current_length += sentence_length |
|
|
|
|
|
if current_chunk.strip(): |
|
|
chunks.append(current_chunk.strip()) |
|
|
metadata_list.append({ |
|
|
"page": page_num + 1, |
|
|
"source": pdf_file.name, |
|
|
"chunk_type": "text" |
|
|
}) |
|
|
|
|
|
doc.close() |
|
|
return chunks, metadata_list, pdf_bytes, pages_text |
|
|
|
|
|
|
|
|
def save_extracted_text_to_file(chunks: List[str], metadata_list: List[Dict], filename: str): |
|
|
""" |
|
|
์ถ์ถํ ํ
์คํธ๋ฅผ ๋ก์ปฌ ํ์ผ๋ก ์ ์ฅ |
|
|
""" |
|
|
from datetime import datetime
|
|
|
|
|
|
|
|
output_dir = "extracted_text" |
|
|
os.makedirs(output_dir, exist_ok=True) |
|
|
|
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
base_name = os.path.splitext(filename)[0] |
|
|
output_file = os.path.join(output_dir, f"{base_name}_{timestamp}.txt") |
|
|
|
|
|
|
|
|
with open(output_file, 'w', encoding='utf-8') as f: |
|
|
f.write(f"=" * 80 + "\n") |
|
|
f.write(f"๋ฌธ์๋ช
: {filename}\n") |
|
|
f.write(f"์ถ์ถ ์๊ฐ: {timestamp}\n") |
|
|
f.write(f"์ด ์ฒญํฌ ์: {len(chunks)}\n") |
|
|
f.write(f"=" * 80 + "\n\n") |
|
|
|
|
|
for idx, (chunk, meta) in enumerate(zip(chunks, metadata_list), 1): |
|
|
f.write(f"\n{'='*80}\n") |
|
|
f.write(f"์ฒญํฌ #{idx}\n") |
|
|
f.write(f"ํ์ด์ง: {meta.get('page', 'N/A')}\n") |
|
|
f.write(f"ํ์
: {meta.get('chunk_type', 'text')}\n") |
|
|
f.write(f"{'-'*80}\n") |
|
|
f.write(chunk) |
|
|
f.write(f"\n{'='*80}\n") |
|
|
|
|
|
return output_file |
|
|
|
|
|
@st.cache_resource(show_spinner=False) |
|
|
def load_embedding_model(): |
|
|
return SentenceTransformer(EMBEDDING_MODEL) |
|
|
|
|
|
|
|
|
def create_vector_db(chunks: List[str], metadata_list: List[Dict]): |
|
|
embedder = load_embedding_model() |
|
|
|
|
|
client = chromadb.EphemeralClient( |
|
|
settings=chromadb.Settings( |
|
|
anonymized_telemetry=False, |
|
|
allow_reset=True |
|
|
) |
|
|
) |
|
|
|
|
|
try: |
|
|
client.delete_collection("rfx_docs") |
|
|
except Exception: |
|
|
pass |
|
|
|
|
|
collection = client.create_collection( |
|
|
name="rfx_docs", |
|
|
metadata={"hnsw:space": "cosine"} |
|
|
) |
|
|
|
|
|
batch_size = 32 |
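# Encode in small batches to keep memory bounded on long documents.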
|
|
all_embeddings = [] |
|
|
|
|
|
for i in range(0, len(chunks), batch_size): |
|
|
batch = chunks[i:i + batch_size] |
|
|
embeddings = embedder.encode(batch, show_progress_bar=False, convert_to_numpy=True) |
|
|
all_embeddings.extend(embeddings) |
|
|
|
|
|
ids = [f"doc_{i}" for i in range(len(chunks))] |
|
|
collection.add( |
|
|
embeddings=[emb.tolist() for emb in all_embeddings], |
|
|
documents=chunks, |
|
|
metadatas=metadata_list, |
|
|
ids=ids |
|
|
) |
|
|
|
|
|
return collection, embedder |
|
|
|
|
|
|
|
|
def extract_keywords_semantic(text: str, embedder, top_n: int = 5) -> List[str]: |
|
|
words_with_numbers = re.findall(r'[๊ฐ-ํฃ]*\d+[๊ฐ-ํฃ]*', text) |
|
|
candidate_words = re.findall(r'[๊ฐ-ํฃ]{2,}', text) |
|
|
|
|
|
if not candidate_words: |
|
|
return words_with_numbers[:top_n] |
|
|
|
|
|
word_freq = Counter(candidate_words) |
|
|
text_embedding = embedder.encode([text], convert_to_numpy=True)[0] |
|
|
word_embeddings = embedder.encode(list(word_freq.keys()), convert_to_numpy=True) |
|
|
similarities = util.cos_sim(text_embedding, word_embeddings)[0].numpy() |
|
|
|
|
|
scored_words = [] |
|
|
for idx, (word, freq) in enumerate(word_freq.items()): |
|
|
semantic_score = similarities[idx] |
|
|
frequency_score = np.log1p(freq) / 10.0 |
|
|
combined_score = 0.7 * semantic_score + 0.3 * frequency_score |
|
|
scored_words.append((word, combined_score)) |
|
|
|
|
|
scored_words.sort(key=lambda x: x[1], reverse=True) |
|
|
|
|
|
result = [] |
|
|
for word in words_with_numbers[:3]: |
|
|
if word and word not in result: |
|
|
result.append(word) |
|
|
|
|
|
for word, score in scored_words: |
|
|
if word not in result: |
|
|
result.append(word) |
|
|
if len(result) >= top_n: |
|
|
break |
|
|
|
|
|
return result[:top_n] |
|
|
|
|
|
|
|
|
def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict: |
|
|
query_embedding = embedder.encode([query], convert_to_numpy=True)[0] |
|
|
vector_results = collection.query( |
|
|
query_embeddings=[query_embedding.tolist()], |
|
|
n_results=20, |
|
|
include=["documents", "metadatas", "distances"] |
|
|
) |
|
|
|
|
|
keywords = extract_keywords_semantic(query, embedder, top_n=5) |
|
|
|
|
|
hybrid_results = [] |
|
|
for i, doc_id in enumerate(vector_results['ids'][0]): |
|
|
doc = vector_results['documents'][0][i] |
|
|
metadata = vector_results['metadatas'][0][i] |
|
|
vector_score = 1 - vector_results['distances'][0][i] |
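# The collection uses cosine distance (hnsw:space=cosine), so 1 - distance acts as a similarity score.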
|
|
|
|
|
keyword_score = 0 |
|
|
|
|
|
|
|
|
doc_lower = doc.lower() |
|
|
doc_norm = normalize_for_search(doc) |
|
|
|
|
|
for keyword in keywords: |
|
|
kw_lower = keyword.lower() |
|
|
kw_norm = normalize_for_search(keyword) |
|
|
|
|
|
|
|
|
|
|
|
if kw_lower in doc_lower or kw_norm in doc_norm: |
|
|
keyword_score += 1 |
|
|
|
|
|
keyword_score = keyword_score / len(keywords) if keywords else 0 |
|
|
hybrid_score = 0.7 * vector_score + 0.3 * keyword_score |
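# Weighted blend: semantic similarity dominates, exact keyword hits act as a tie-breaker.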
|
|
|
|
|
hybrid_results.append({ |
|
|
'id': doc_id, |
|
|
'document': doc, |
|
|
'metadata': metadata, |
|
|
'hybrid_score': hybrid_score, |
|
|
'vector_score': vector_score, |
|
|
'keyword_score': keyword_score |
|
|
}) |
|
|
|
|
|
hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True) |
|
|
top_results = hybrid_results[:top_k] |
|
|
|
|
|
return { |
|
|
'documents': [[r['document'] for r in top_results]], |
|
|
'metadatas': [[r['metadata'] for r in top_results]], |
|
|
'scores': [r['hybrid_score'] for r in top_results], |
|
|
'keywords': keywords |
|
|
} |
|
|
|
|
|
|
|
|
def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> Dict: |
|
|
docs = search_results['documents'][0] |
|
|
metas = search_results['metadatas'][0] |
|
|
|
|
|
formatted_docs = [] |
|
|
for i, (doc, meta) in enumerate(zip(docs, metas), 1): |
|
|
formatted_docs.append(f"[๋ฌธ์ {i}] (ํ์ด์ง {meta['page']})\n{doc}") |
|
|
|
|
|
context = "\n\n".join(formatted_docs) |
|
|
|
|
|
system_prompt = """๋น์ ์ RFx ๋ฌธ์ ๋ถ์ ์ ๋ฌธ๊ฐ์
๋๋ค. |
|
|
์ฃผ์ด์ง 3๊ฐ์ ๋ฌธ์ ์ค์์ ์ฌ์ฉ์ ์ง๋ฌธ๊ณผ **๊ฐ์ฅ ๊ด๋ จ ์๋ ๋จ 1๊ฐ์ ํต์ฌ ์ ๋ณด**๋ง ์ ํํ์ธ์. |
|
|
|
|
|
**์ค์ ๊ท์น:** |
|
|
1. ๋ฐ๋์ **1๊ฐ์ ํ
์คํธ**๋ง ์ถ์ถ
|
|
2. ๊ฐ์ฅ ์ง์ ์ ์ผ๋ก ์ง๋ฌธ์ ๋ตํ๋ ์ ๋ณด ์ ํ |
|
|
3. ๊ธ์ก, ๋ ์ง, ์๋ ๋ฑ ๊ตฌ์ฒด์ ์ธ ์ซ์ ์ ๋ณด ์ฐ์ |
|
|
4. ์ถ์ถ๋ ํ
์คํธ๋ ์๋ฌธ ๊ทธ๋๋ก ์ ์ง (150์ ์ด๋ด)
|
|
5. JSON ํ์์ผ๋ก๋ง ์๋ต |
|
|
|
|
|
**์๋ต ํ์:** |
|
|
{ |
|
|
"selected_text": "์ ํ๋ ํ
์คํธ (์๋ฌธ ๊ทธ๋๋ก)", |
|
|
"page": ํ์ด์ง๋ฒํธ, |
|
|
"relevance_reason": "์ด ํ
์คํธ๋ฅผ ์ ํํ ์ด์ " |
|
|
}""" |
|
|
|
|
|
user_prompt = f"""<์ง๋ฌธ> |
|
|
{query} |
|
|
</์ง๋ฌธ> |
|
|
|
|
|
<๊ฒ์๋ ๋ฌธ์๋ค> |
|
|
{context} |
|
|
</๊ฒ์๋ ๋ฌธ์๋ค> |
|
|
|
|
|
์ 3๊ฐ ๋ฌธ์์์ ์ง๋ฌธ์ ๊ฐ์ฅ ์ ํํ๊ฒ ๋ตํ๋ **๋จ 1๊ฐ์ ํต์ฌ ์ ๋ณด**๋ฅผ JSON ํ์์ผ๋ก ์ ํํ์ธ์. |
|
|
์ ํํ ํ
์คํธ๋ 150์ ์ด๋ด๋ก ํ์ธ์."""
|
|
|
|
|
headers = { |
|
|
"Content-Type": "application/json", |
|
|
"Authorization": f"Bearer {api_key}" |
|
|
} |
|
|
|
|
|
payload = { |
|
|
"model": "grok-3", |
|
|
"messages": [ |
|
|
{"role": "system", "content": system_prompt}, |
|
|
{"role": "user", "content": user_prompt} |
|
|
], |
|
|
"temperature": 0.1, |
|
|
"max_tokens": 1000, |
|
|
"stream": False |
|
|
} |
|
|
|
|
|
try: |
|
|
response = requests.post( |
|
|
f"{GROK_API_BASE}/chat/completions", |
|
|
headers=headers, |
|
|
json=payload, |
|
|
timeout=120 |
|
|
) |
|
|
|
|
|
if response.status_code != 200: |
|
|
return {"error": f"API ์ค๋ฅ: {response.status_code}"} |
|
|
|
|
|
result = response.json() |
|
|
content = result["choices"][0]["message"]["content"] |
|
|
content = content.replace("```json", "").replace("```", "").strip() |
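# The model may wrap its JSON in code fences; strip them before parsing.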
|
|
extracted_data = json.loads(content) |
|
|
|
|
|
return extracted_data |
|
|
|
|
|
except Exception as e: |
|
|
return {"error": f"์ค๋ฅ: {str(e)}"} |
|
|
|
|
|
|
|
|
def build_context(search_results: Dict, max_length: int = 3000) -> str: |
|
|
context_parts = [] |
|
|
current_length = 0 |
|
|
|
|
|
docs = search_results['documents'][0] |
|
|
metas = search_results['metadatas'][0] |
|
|
|
|
|
for i, (doc, meta) in enumerate(zip(docs, metas), 1): |
|
|
part = f"[๋ฌธ์ {i}] (ํ์ด์ง {meta['page']})\n{doc}\n" |
|
|
part_length = len(part) |
|
|
|
|
|
if current_length + part_length > max_length: |
|
|
remaining = max_length - current_length |
|
|
if remaining > 200: |
|
|
part = f"[๋ฌธ์ {i}] (ํ์ด์ง {meta['page']})\n{doc[:remaining-50]}...\n" |
|
|
context_parts.append(part) |
|
|
break |
|
|
|
|
|
context_parts.append(part) |
|
|
current_length += part_length |
|
|
|
|
|
return "\n".join(context_parts) |
|
|
|
|
|
|
|
|
def generate_answer(query: str, search_results: Dict, api_key: str) -> str: |
|
|
context = build_context(search_results, max_length=4000) |
|
|
|
|
|
system_prompt = """๋น์ ์ ์๋์ฐจ ์ ์กฐ์
RFx ๋ฌธ์ ์ ๋ฌธ ๋ถ์๊ฐ์
๋๋ค. |
|
|
**์ฐ์
 ํนํ ์ง์นจ:**
|
|
1. **์๋์ฐจ ์ ์กฐ์
 ์์ดยท์ฝ์ด ํด์**: ์ฌ์ฉ์์ ์ง๋ฌธ์๋ ์๋์ฐจ ์ ์กฐ์
 ํน์ ์ ์์ดยท์ฝ์ดยท์ ๋ฌธ์ฉ์ด๊ฐ ํฌํจ๋ ์ ์์ผ๋ฏ๋ก ์ฐ์
 ๋ฌธ๋งฅ์ ๋ง๊ฒ ์ ํํ ํด์ํ๋ผ.
|
|
2. **์ธ์ด ํผ์ฉ ๋ฐ ๋น๋ฌธ ๋์**: ์ฌ์ฉ์์ ๋ฌธ์ฅ์ ํ๊ตญ์ด์ ์์ด๊ฐ ์์ด๊ฑฐ๋ ๋ฌธ๋ฒ ์ค๋ฅ๊ฐ ์์ ์ ์์ผ๋ฏ๋ก ์๋๋ฅผ ์ถ๋ก ํ์ฌ ์ ํํ ์ดํดํ๋ผ. |
|
|
3. **๋ชจํธํ ์ง๋ฌธ ์๋ ๋ณด์ **: ์ฌ์ฉ์์ ์ง๋ฌธ์ด ๋ถ์์ ํ๊ฑฐ๋ ๋ชจํธํด๋ ์ง๋ฌธ ์๋๋ฅผ ์ถ๋ก ํ์ฌ ์ ์ ํ๊ฒ ์ฌ๊ตฌ์ฑํ๋ผ. |
|
|
**๋ฌธ์ ๊ธฐ๋ฐ ์๋ต ์์น (์ ๋ ์ถ์ธก ๊ธ์ง):** |
|
|
1. ์ ๊ณต๋ ๋ฌธ์๋ฅผ **๋งค์ฐ ๊ผผ๊ผผํ** ์ฝ๊ณ ์ ํํ ์ ๋ณด๋ฅผ ์ฐพ์ผ์ธ์ |
|
|
2. **๋ฐ๋์ ๋ฌธ์์์ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์ ๋ต๋ณ**ํ๊ณ , ๋ฌธ์์ ์๋ ๋ด์ฉ์ ์์๋ก ์ถ์ธกํ์ง ๋ง๊ณ **"๋ฌธ์์์ ๊ด๋ จ ์ ๋ณด๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค"**๋ผ๊ณ ๋ช
์ํ๋ผ
|
|
3. **๋ฌธ์์ ์ ํ ๋ฌด๊ดํ ์ง๋ฌธ**(์: ์ ์ฌ ์ถ์ฒ, ๋ ์จ, ์ผ์ ๋ํ ๋ฑ)์ **"์ฃ์กํ์ง๋ง, ์ ๊ณต๋ ๋ฌธ์์๋ ํด๋น ์ง๋ฌธ๊ณผ ๊ด๋ จ๋ ์ ๋ณด๊ฐ ํฌํจ๋์ด ์์ง ์์ต๋๋ค."**๋ผ๊ณ ๋ง ๋ต๋ณํ๊ณ ์ถ๊ฐ ์ค๋ช
 ์์ด ์ข
๋ฃํ๋ผ
|
|
4. ๋ฌธ์์ ์ ๋ณด๊ฐ ์๋๋ฐ๋ "์๋ค"๊ณ ํ์ง ๋ง์ธ์ |
|
|
**ํต์ฌ ์ ๋ณด ์ฐ์ ์ถ์ถ:** |
|
|
- ๊ธ์ก, ์๋, ๊ท๊ฒฉ, ์ผ์ , ์๊ตฌ์กฐ๊ฑด ๋ฑ **์์น ๊ธฐ๋ฐ ์ ๋ณด๋ฅผ ์ต์ฐ์ **์ผ๋ก ์๋ณํ๊ณ ์ ํํ๊ฒ ๋ฐํํ๋ผ |
|
|
- ์ซ์, ๊ธ์ก, ๋ ์ง ๋ฑ ๊ตฌ์ฒด์ ์ธ ์ ๋ณด๋ฅผ ์ฐ์ ์ ์ผ๋ก ์ฐพ์ผ์ธ์ |
|
|
**๋ต๋ณ ํ์:** |
|
|
- ๋ต๋ณ ์ ๋ฐ๋์ **[ํ์ด์ง X]** ํํ๋ก ์ถ์ฒ๋ฅผ ๋ช
์ํ์ธ์
|
|
- **์ ๋ ์ค์**: "๋ฌธ์ 1", "๋ฌธ์ 2" ๊ฐ์ ํ๊ธฐ๋ ์ ๋ ์ฌ์ฉํ์ง ๋ง์ธ์ |
|
|
- ํต์ฌ ๋ต๋ณ์ ๋จผ์ ๋ช
ํํ๊ฒ ์ ์
|
|
- ๋งํฌ๋ค์ด ํ์์ผ๋ก๋ง ๋ต๋ณํ์ธ์ |
|
|
- ์ง๋ฌธ์ ๋ฐ๋ผ ๊ฐ์ฅ ์ ์ ํ ๊ตฌ์กฐ๋ก ๋ต๋ณํ์ธ์ (๋จ๊ณ๋ณ, ์นดํ
๊ณ ๋ฆฌ๋ณ, ์๊ฐ์ ๋ฑ)
|
|
|
|
|
**์๋ฌธ ์ธ์ฉ ๊ท์น (ํ์ด๋ผ์ดํธ์ฉ):** |
|
|
- ํต์ฌ ๋ด์ฉ์ ์ค๋ช
ํ ๋๋ ํฐ๋ฐ์ดํ("")๋ก PDF ์๋ฌธ์ ๊ทธ๋๋ก ์ธ์ฉํ์ธ์
|
|
- ํฐ๋ฐ์ดํ ์์ ๋ด์ฉ์ PDF ์๋ฌธ์ **ํ ๊ธ์๋ ๋ฐ๊พธ์ง ๋ง๊ณ ** ๊ทธ๋๋ก ๋ณต์ฌ |
|
|
- ๋ฌธ์ฅ ์ข
๊ฒฐ์ด("~ํจ", "~์", "~์์ฒญํจ" ๋ฑ)๋ ์๋ฌธ ๊ทธ๋๋ก ์ ์ง
|
|
- ์ธ์ฉ ์์: "๊ธฐ์ ํ๊ฐ ์ ์๊ฐ ๋ฐฐ์ ํ๋(100์ )์ 85% ์ด์์ธ ์๋ฅผ ๊ธฐ์ ํ๊ฐ ์ ๊ฒฉ์๋ก ์ ์ " [ํ์ด์ง 9] |
|
|
- ์๋ฌธ ์ธ์ฉ ํ ํ์ํ๋ฉด ๋ถ์ฐ ์ค๋ช
 ์ถ๊ฐ ๊ฐ๋ฅ"""
|
|
|
|
|
user_prompt = f"""๋ค์ ๋ฌธ์๋ค์ ๋งค์ฐ ๊ผผ๊ผผํ ์ฝ๊ณ ์ง๋ฌธ์ ๋ต๋ณํ์ธ์. |
|
|
|
|
|
<๋ฌธ์> |
|
|
{context} |
|
|
</๋ฌธ์> |
|
|
|
|
|
<์ง๋ฌธ> |
|
|
{query} |
|
|
</์ง๋ฌธ> |
|
|
|
|
|
**๋ต๋ณ ์์ฑ ๊ฐ์ด๋:** |
|
|
|
|
|
1. **๊ตฌ์กฐํ**: ์ง๋ฌธ ์ ํ์ ๋ง๋ ๊ฐ์ฅ ์ฝ๊ธฐ ์ฌ์ด ๊ตฌ์กฐ ์ ํ |
|
|
- ์ ์ฐจ/ํ๋ก์ธ์ค ์ง๋ฌธ โ ๋จ๊ณ๋ณ ๋ฒํธ (1, 2, 3...) |
|
|
- ํญ๋ชฉ ๋์ด ์ง๋ฌธ โ ๋ถ๋ฆฟ ํฌ์ธํธ (โข ๋๋ *) |
|
|
- ๋น๊ต/์ ํ ์ง๋ฌธ โ ์นดํ
๊ณ ๋ฆฌ๋ณ ๊ตฌ๋ถ
|
|
|
|
|
2. **์๋ฌธ ์ธ์ฉ**: ํต์ฌ ๋ด์ฉ์ ํฐ๋ฐ์ดํ๋ก PDF ์๋ฌธ ๊ทธ๋๋ก ์ธ์ฉ |
|
|
- ์: "๊ธฐ์ ํ๊ฐ ์ ๊ฒฉ์๋ฅผ ๋์์ผ๋ก ๊ฐ๊ฒฉ ์
์ฐฐ์ ์ค์ํ์ฌ, ํ๊ตญ์๋์ฐจ์ฐ๊ตฌ์์ ์์ ๊ฐ๊ฒฉ์ดํ ์ต์ ๊ฐ๊ฒฉ ํฌ์ฐฐ์๋ฅผ ๋์ฐฐ์๋ก ์ ์ " [ํ์ด์ง 9]
|
|
- ํฐ๋ฐ์ดํ ์ = ์๋ฌธ ๊ทธ๋๋ก (์ ๋ ์์ญ ๊ธ์ง) |
|
|
|
|
|
3. **์ถ์ฒ ํ๊ธฐ**: ๋ชจ๋ ์ ๋ณด์ [ํ์ด์ง X] ํ๊ธฐ |
|
|
|
|
|
4. **ํ์**: ๋งํฌ๋ค์ด๋ง ์ฌ์ฉ, "๋ฌธ์ 1" ๊ฐ์ ํ๊ธฐ ๊ธ์ง""" |
|
|
|
|
|
headers = { |
|
|
"Content-Type": "application/json", |
|
|
"Authorization": f"Bearer {api_key}" |
|
|
} |
|
|
|
|
|
payload = { |
|
|
"model": "grok-3", |
|
|
"messages": [ |
|
|
{"role": "system", "content": system_prompt}, |
|
|
{"role": "user", "content": user_prompt} |
|
|
], |
|
|
"temperature": 0.1, |
|
|
"max_tokens": 2000, |
|
|
"stream": False |
|
|
} |
|
|
|
|
|
try: |
|
|
response = requests.post( |
|
|
f"{GROK_API_BASE}/chat/completions", |
|
|
headers=headers, |
|
|
json=payload, |
|
|
timeout=120 |
|
|
) |
|
|
|
|
|
if response.status_code != 200: |
|
|
error_detail = "" |
|
|
try: |
|
|
error_data = response.json() |
|
|
error_detail = error_data.get('error', {}).get('message', '') |
|
|
except Exception: |
|
|
error_detail = response.text |
|
|
|
|
|
return f"โ API ์ค๋ฅ (์ฝ๋: {response.status_code})\n์์ธ: {error_detail}" |
|
|
|
|
|
result = response.json() |
|
|
return result["choices"][0]["message"]["content"] |
|
|
|
|
|
except Exception as e: |
|
|
return f"โ ์ค๋ฅ: {str(e)}" |
|
|
|
|
|
|
|
|
def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes: |
|
|
""" |
|
|
PyMuPDF ๊ธฐ๋ฐ์ ํ์ด๋ผ์ดํธ ํจ์ - ์ ์ฒด ์ฐ์ , ์คํจ์์๋ง ๋ถํ |
|
|
""" |
|
|
doc = fitz.open(stream=pdf_bytes, filetype="pdf") |
|
|
yellow_color = [1.0, 1.0, 0.0] |
|
|
|
|
|
def normalize_text(text): |
|
|
"""ํ
์คํธ ์ ๊ทํ""" |
|
|
return re.sub(r'\s+', ' ', text.strip()) |
|
|
|
|
|
def merge_rects(rects, threshold=5): |
|
|
"""๊ฒน์น๊ฑฐ๋ ์ธ์ ํ ์ฌ๊ฐํ๋ค์ ๋ณํฉ""" |
|
|
if not rects: |
|
|
return [] |
|
|
|
|
|
|
|
|
sorted_rects = sorted(rects, key=lambda r: (r.y0, r.x0)) |
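# Sort top-to-bottom, then left-to-right, so rectangles on the same visual line end up adjacent and can be merged.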
|
|
merged = [sorted_rects[0]] |
|
|
|
|
|
for rect in sorted_rects[1:]: |
|
|
last = merged[-1] |
|
|
|
|
|
if abs(rect.y0 - last.y0) < threshold: |
|
|
if rect.x0 <= last.x1 + threshold: |
|
|
merged[-1] = fitz.Rect( |
|
|
min(last.x0, rect.x0), |
|
|
min(last.y0, rect.y0), |
|
|
max(last.x1, rect.x1), |
|
|
max(last.y1, rect.y1) |
|
|
) |
|
|
else: |
|
|
merged.append(rect) |
|
|
|
|
|
elif rect.y0 <= last.y1 + 20: |
|
|
merged.append(rect) |
|
|
else: |
|
|
merged.append(rect) |
|
|
|
|
|
return merged |
|
|
|
|
|
def find_text_across_lines(page, search_text): |
|
|
"""์ค๋ฐ๊ฟ์ ๋์ด์ ํ
์คํธ ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ๋ฌด์ ๋น๊ต""" |
|
|
found_rects = [] |
|
|
|
|
|
|
|
|
blocks = page.get_text("dict")["blocks"] |
|
|
|
|
|
|
|
|
lines_info = [] |
|
|
|
|
|
for block in blocks: |
|
|
if "lines" not in block: |
|
|
continue |
|
|
for line in block["lines"]: |
|
|
line_text = "" |
|
|
for span in line["spans"]: |
|
|
line_text += span["text"] |
|
|
if line_text.strip(): |
|
|
lines_info.append((line_text, fitz.Rect(line["bbox"]))) |
|
|
|
|
|
if not lines_info: |
|
|
return [] |
|
|
|
|
|
|
|
|
search_no_space = search_text.lower().replace(" ", "").replace("\n", "") |
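# Compare with all whitespace removed so line wrapping and spacing differences cannot prevent a match.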
|
|
|
|
|
|
|
|
for start_idx in range(len(lines_info)): |
|
|
combined_text = "" |
|
|
combined_bboxes = [] |
|
|
|
|
|
for end_idx in range(start_idx, min(start_idx + 5, len(lines_info))): |
|
|
line_text, line_bbox = lines_info[end_idx] |
|
|
combined_text += line_text |
|
|
combined_bboxes.append(line_bbox) |
|
|
|
|
|
|
|
|
combined_no_space = combined_text.lower().replace(" ", "").replace("\n", "") |
|
|
|
|
|
|
|
|
if search_no_space in combined_no_space: |
|
|
|
|
|
for bbox in combined_bboxes: |
|
|
found_rects.append(bbox) |
|
|
print(f" โ
๋ผ์ธ ๋งค์นญ ({start_idx+1}~{end_idx+1}์ค): {len(combined_bboxes)}๊ฐ ์์ญ") |
|
|
return merge_rects(found_rects) |
|
|
|
|
|
return [] |
|
|
|
|
|
def find_text_with_pymupdf(page, search_text): |
|
|
"""PyMuPDF๋ก ํ
์คํธ ์ฐพ๊ธฐ - ์ ํํ๊ณ ๊น๋ํ๊ฒ""" |
|
|
found_rects = [] |
|
|
search_text = search_text.strip() |
|
|
|
|
|
print(f" ๊ฒ์ ์ค...") |
|
|
|
|
|
|
|
|
instances = page.search_for(search_text) |
|
|
if instances: |
|
|
print(f" โ
์ฑ๊ณต [์๋ณธ]: {len(instances)}๊ฐ") |
|
|
return merge_rects(instances) |
|
|
|
|
|
|
|
|
normalized = normalize_text(search_text) |
|
|
if normalized != search_text: |
|
|
instances = page.search_for(normalized) |
|
|
if instances: |
|
|
print(f" โ
์ฑ๊ณต [์ ๊ทํ]: {len(instances)}๊ฐ") |
|
|
return merge_rects(instances) |
|
|
|
|
|
|
|
|
line_results = find_text_across_lines(page, search_text) |
|
|
if line_results: |
|
|
return line_results |
|
|
|
|
|
print(f" โ ๏ธ ๋ผ์ธ ๋งค์นญ ์คํจ โ ํต์ฌ ๊ตฌ๋ฌธ") |
|
|
|
|
|
|
|
|
if len(search_text) > 50: |
|
|
|
|
|
front = search_text[:30] |
|
|
front_inst = page.search_for(front) |
|
|
if front_inst: |
|
|
print(f" โ
์๋ถ๋ถ ๋งค์นญ: {front[:20]}...") |
|
|
found_rects.extend(front_inst[:1]) |
|
|
|
|
|
|
|
|
back = search_text[-20:] |
|
|
back_inst = page.search_for(back) |
|
|
if back_inst: |
|
|
print(f" โ
๋ท๋ถ๋ถ ๋งค์นญ: ...{back[:15]}") |
|
|
found_rects.extend(back_inst[:1]) |
|
|
|
|
|
if found_rects: |
|
|
return merge_rects(found_rects) |
|
|
|
|
|
print(f" โ ๏ธ ํต์ฌ ๊ตฌ๋ฌธ ์คํจ โ ํค์๋") |
|
|
|
|
|
|
|
|
keywords = re.findall(r'[๊ฐ-ํฃ]{10,}', search_text) |
|
|
if not keywords: |
|
|
keywords = re.findall(r'[๊ฐ-ํฃ]{7,}', search_text) |
|
|
|
|
|
if keywords: |
|
|
for kw in keywords[:2]: |
|
|
inst = page.search_for(kw) |
|
|
if inst: |
|
|
print(f" โ
ํค์๋: {kw}") |
|
|
found_rects.extend(inst[:1]) |
|
|
|
|
|
if found_rects: |
|
|
return merge_rects(found_rects) |
|
|
|
|
|
|
|
|
print(f" ์ตํ: ๋ธ๋ก") |
|
|
blocks = page.get_text("dict")["blocks"] |
|
|
search_norm = normalize_text(search_text.lower()) |
|
|
|
|
|
for block in blocks: |
|
|
if "lines" not in block: |
|
|
continue |
|
|
|
|
|
block_text = "" |
|
|
for line in block["lines"]: |
|
|
for span in line["spans"]: |
|
|
block_text += span["text"] + " " |
|
|
|
|
|
block_norm = normalize_text(block_text.lower()) |
|
|
|
|
|
if search_norm in block_norm: |
|
|
found_rects.append(fitz.Rect(block["bbox"])) |
|
|
print(f" โ
๋ธ๋ก ์ผ์น") |
|
|
break |
|
|
|
|
|
return merge_rects(found_rects) if found_rects else [] |
|
|
|
|
|
print(f"\n{'='*80}") |
|
|
print(f"ํ์ด๋ผ์ดํธ ์์ - ์ด {len(highlight_info)}๊ฐ ํญ๋ชฉ") |
|
|
print(f"{'='*80}") |
|
|
|
|
|
total_success = 0 |
|
|
total_failed = 0 |
|
|
|
|
|
for idx, item in enumerate(highlight_info, 1): |
|
|
page_num = item['page'] - 1 |
|
|
text_to_highlight = item['text'].strip() |
|
|
|
|
|
if page_num >= len(doc): |
|
|
print(f"\n[{idx}] โ ํ์ด์ง ์ค๋ฅ: {page_num + 1}") |
|
|
total_failed += 1 |
|
|
continue |
|
|
|
|
|
page = doc[page_num] |
|
|
|
|
|
print(f"\n[{idx}/{len(highlight_info)}]") |
|
|
print(f" ๐ ํ์ด์ง: {page_num + 1}") |
|
|
print(f" ๐ ๊ธธ์ด: {len(text_to_highlight)}์") |
|
|
print(f" ๐ฌ ๋ด์ฉ: {text_to_highlight[:70]}...") |
|
|
|
|
|
|
|
|
found_rects = find_text_with_pymupdf(page, text_to_highlight) |
|
|
|
|
|
|
|
|
unique_rects = [] |
|
|
for rect in found_rects: |
|
|
is_duplicate = False |
|
|
for existing in unique_rects: |
|
|
|
|
|
if (abs(rect.x0 - existing.x0) < 5 and |
|
|
abs(rect.y0 - existing.y0) < 5 and |
|
|
abs(rect.x1 - existing.x1) < 5 and |
|
|
abs(rect.y1 - existing.y1) < 5): |
|
|
is_duplicate = True |
|
|
break |
|
|
if not is_duplicate: |
|
|
unique_rects.append(rect) |
|
|
|
|
|
|
|
|
highlighted_count = 0 |
|
|
for rect in unique_rects: |
|
|
try: |
|
|
highlight = page.add_highlight_annot(rect) |
|
|
highlight.set_colors(stroke=yellow_color) |
|
|
highlight.update() |
|
|
highlighted_count += 1 |
|
|
except Exception as e: |
|
|
print(f" โ ํ์ด๋ผ์ดํธ ์คํจ: {e}") |
|
|
|
|
|
if highlighted_count > 0: |
|
|
print(f" โ
์๋ฃ: {highlighted_count}๊ฐ ์์ญ") |
|
|
total_success += 1 |
|
|
else: |
|
|
print(f" โ ์คํจ: ํ
์คํธ๋ฅผ ์ฐพ์ ์ ์์") |
|
|
total_failed += 1 |
|
|
|
|
|
print(f"\n{'='*80}") |
|
|
print(f"๐ ์ต์ข
๊ฒฐ๊ณผ: โ
์ฑ๊ณต {total_success}๊ฐ / โ ์คํจ {total_failed}๊ฐ") |
|
|
print(f"{'='*80}\n") |
|
|
|
|
|
output_bytes = doc.tobytes() |
|
|
doc.close() |
|
|
return output_bytes |
|
|
|
|
|
def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]: |
|
|
if "error" in grok_result: |
|
|
return [] |
|
|
|
|
|
highlights = [] |
|
|
selected_text = grok_result.get("selected_text", "") |
|
|
page = grok_result.get("page", 1) |
|
|
|
|
|
if selected_text and len(selected_text) <= 150: |
|
|
highlights.append({ |
|
|
'text': selected_text, |
|
|
'page': page |
|
|
}) |
|
|
|
|
|
return highlights |
|
|
|
|
|
|
|
|
|
|
|
def extract_highlights_from_answer(answer: str) -> List[Dict]: |
|
|
""" |
|
|
๋ต๋ณ์์ ํ์ด๋ผ์ดํธํ ํ
์คํธ ์ถ์ถ |
|
|
[ํ์ด์ง X] ์๋ค ๋ชจ๋ ํด๋น ํ์ด์ง๋ก ๊ฐ์ฃผ |
|
|
""" |
|
|
highlights = [] |
|
|
|
|
|
print(f"\n{'='*80}") |
|
|
print(f"๋ต๋ณ ํ
์คํธ ๋ถ์ ์ค...") |
|
|
print(f"{'='*80}\n") |
|
|
|
|
|
|
|
|
page_pattern = r'\[\s*ํ์ด์ง\s*(\d+)\s*\]' |
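# Answers cite sources as "[ํ์ด์ง N]"; quotes and list items around each tag are attributed to that page for highlighting.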
|
|
page_matches = list(re.finditer(page_pattern, answer)) |
|
|
|
|
|
print(f"๐ [ํ์ด์ง] ํ๊ทธ {len(page_matches)}๊ฐ ๋ฐ๊ฒฌ\n") |
|
|
|
|
|
quoted_matches = [] |
|
|
list_matches = [] |
|
|
|
|
|
|
|
|
for i, match in enumerate(page_matches): |
|
|
page_num = match.group(1) |
|
|
tag_start = match.start() |
|
|
tag_end = match.end() |
|
|
|
|
|
|
|
|
|
|
|
section_start = 0 |
|
|
if i > 0: |
|
|
section_start = page_matches[i-1].end() |
|
|
|
|
|
|
|
|
before_section = answer[section_start:tag_start] |
|
|
|
|
|
|
|
|
last_para_match = re.search(r'([-*โ]\s+.+)$', before_section, re.DOTALL) |
|
|
if last_para_match: |
|
|
before_text = last_para_match.group(1) |
|
|
print(f"--- ํ์ด์ง {page_num} ์๋ถ๋ถ (๊ธธ์ด: {len(before_text)}์) ---") |
|
|
print(f"{before_text[:150]}...\n") |
|
|
|
|
|
|
|
|
quotes = re.findall(r'"([^"]+)"', before_text) |
|
|
for quote in quotes: |
|
|
quote_clean = quote.strip() |
|
|
if len(quote_clean) > 10: |
|
|
quoted_matches.append((quote_clean, int(page_num))) |
|
|
print(f" โ [์-์ธ์ฉ๋ฌธ] \"{quote_clean[:60]}...\"") |
|
|
|
|
|
|
|
|
next_page_pos = len(answer) |
|
|
if i + 1 < len(page_matches): |
|
|
next_page_pos = page_matches[i + 1].start() |
|
|
|
|
|
section = answer[tag_end:next_page_pos] |
|
|
print(f"--- ํ์ด์ง {page_num} ๋ท๋ถ๋ถ (๊ธธ์ด: {len(section)}์) ---") |
|
|
print(f"{section[:150]}...\n") |
|
|
|
|
|
|
|
|
quotes = re.findall(r'"([^"]+)"', section) |
|
|
for quote in quotes: |
|
|
quote_clean = quote.strip() |
|
|
if len(quote_clean) > 10: |
|
|
quoted_matches.append((quote_clean, int(page_num))) |
|
|
print(f" โ [๋ค-์ธ์ฉ๋ฌธ] \"{quote_clean[:60]}...\"") |
|
|
|
|
|
|
|
|
lines = section.split('\n') |
|
|
for line in lines: |
|
|
line_stripped = line.strip() |
|
|
|
|
|
if len(line_stripped) < 3: |
|
|
continue |
|
|
|
|
|
if line_stripped.startswith('**') or line_stripped.startswith('#'): |
|
|
continue |
|
|
|
|
|
item = None |
|
|
|
|
|
if line_stripped.startswith('โ'): |
|
|
item = line_stripped[1:].strip() |
|
|
elif line_stripped.startswith('- ') or line_stripped.startswith('* '): |
|
|
item = line_stripped[2:].strip() |
|
|
elif re.match(r'^\d+\.\s+', line_stripped): |
|
|
match_obj = re.match(r'^\d+\.\s+(.+)$', line_stripped) |
|
|
if match_obj: |
|
|
item = match_obj.group(1).strip() |
|
|
|
|
|
if item: |
|
|
item = re.sub(r'\[\s*ํ์ด์ง\s*\d+\s*\]', '', item).strip() |
|
|
item = re.sub(r'\*\*([^*]+)\*\*', r'\1', item).strip() |
|
|
item = re.sub(r'\([""""][^)]+[""""\)]+', '', item).strip() |
|
|
item = re.sub(r'\s*\([^)]{0,50}\)\s*$', '', item).strip() |
|
|
|
|
|
if 3 <= len(item) <= 200: |
|
|
list_matches.append((item, int(page_num))) |
|
|
print(f" โ [๋ฆฌ์คํธ] {item[:50]}...") |
|
|
|
|
|
print(f"\n{'='*40}") |
|
|
print(f"๐ ์ธ์ฉ๋ฌธ: {len(quoted_matches)}๊ฐ") |
|
|
print(f"๐ ๋ฆฌ์คํธ: {len(list_matches)}๊ฐ") |
|
|
print(f"{'='*40}\n") |
|
|
|
|
|
|
|
|
all_matches = [] |
|
|
|
|
|
if quoted_matches and list_matches: |
|
|
all_short = all(len(q[0]) <= 30 for q in quoted_matches) |
|
|
if all_short: |
|
|
print(f"โ ์งง์ ์ธ์ฉ๋ฌธ + ๋ฆฌ์คํธ ๋ชจ๋") |
|
|
all_matches = quoted_matches + list_matches |
|
|
else: |
|
|
print(f"โ ์ธ์ฉ๋ฌธ๋ง") |
|
|
all_matches = quoted_matches |
|
|
elif quoted_matches: |
|
|
print(f"โ ์ธ์ฉ๋ฌธ๋ง") |
|
|
all_matches = quoted_matches |
|
|
elif list_matches: |
|
|
print(f"โ ๋ฆฌ์คํธ๋ง") |
|
|
all_matches = list_matches |
|
|
|
|
|
|
|
|
seen = set() |
|
|
for text, page in all_matches: |
|
|
if text and (text, page) not in seen: |
|
|
highlights.append({ |
|
|
'text': text, |
|
|
'page': page |
|
|
}) |
|
|
seen.add((text, page)) |
|
|
|
|
|
print(f"\n{'='*80}") |
|
|
print(f"โ
์ต์ข
์ถ์ถ: {len(highlights)}๊ฐ") |
|
|
for i, h in enumerate(highlights, 1): |
|
|
print(f" [{i}] ํ์ด์ง {h['page']}: {h['text'][:60]}...") |
|
|
print(f"{'='*80}\n") |
|
|
|
|
|
return highlights |
|
|
|
|
|
|
|
|
|
|
|
def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0): |
|
|
highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info) |
|
|
doc = fitz.open(stream=highlighted_pdf, filetype="pdf") |
|
|
highlighted_pages = set(h['page'] for h in highlight_info) |
|
|
|
|
|
pdf_html = '<div class="pdf-container" id="pdf-viewer-container">' |
|
|
|
|
|
for page_num in range(len(doc)): |
|
|
page = doc[page_num] |
|
|
pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level)) |
|
|
img_data = pix.tobytes("png") |
|
|
img_base64 = base64.b64encode(img_data).decode() |
|
|
|
|
|
zoom_percentage = int(zoom_level * 50) |
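# zoom_level 2.0 renders at 100% container width; higher zoom mainly increases the image resolution.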
|
|
page_id = f'page-{page_num + 1}' |
|
|
pdf_html += f'<div id="{page_id}" style="margin-bottom: 2rem; position: relative;">' |
|
|
|
|
|
if (page_num + 1) in highlighted_pages: |
|
|
pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">โญ ํ์ด์ง {page_num + 1}</div>' |
|
|
else: |
|
|
pdf_html += f'<div style="background: #ADADAD; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;"> ํ์ด์ง {page_num + 1}</div>' |
|
|
|
|
|
pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />' |
|
|
pdf_html += '</div>' |
|
|
|
|
|
pdf_html += '</div>' |
|
|
doc.close() |
|
|
return pdf_html |
|
|
|
|
|
|
|
|
def main(): |
|
|
init_session() |
|
|
|
|
|
if not st.session_state.processed: |
|
|
col1, col2, col3 = st.columns([1, 1, 1]) |
|
|
with col2: |
|
|
st.markdown("<div style='height: 30vh;'></div>", unsafe_allow_html=True) |
|
|
st.image("img/plobin-grey.png", use_container_width=True) |
|
|
st.text(' ') |
|
|
|
|
|
with st.sidebar: |
|
|
st.image("img/plobin-right-only.png", width=85) |
|
|
|
|
|
uploaded_file = st.file_uploader( |
|
|
"๋๋๊ทธํ์ฌ ํ์ผ์ ์
๋ก๋ ๋๋ ํด๋ฆญํ์ฌ ์ ํํ์ธ์.", |
|
|
type=['pdf'], |
|
|
label_visibility="visible", |
|
|
help="PDF ํ์ผ๋ง ์
๋ก๋ ๊ฐ๋ฅํฉ๋๋ค (์ต๋ 200MB)" |
|
|
) |
|
|
|
|
|
if uploaded_file: |
|
|
if st.button("๋ฌธ์ ์ฒ๋ฆฌ ์์", type="primary", use_container_width=True): |
|
|
if not GROK_API_KEY or not OPENAI_API_KEY: |
|
|
st.error("โ ๏ธ GROK_API_KEY ๋๋ OPENAI_API_KEY๊ฐ .env ํ์ผ์ ์ค์ ๋์ง ์์์ต๋๋ค!") |
|
|
st.stop() |
|
|
|
|
|
st.session_state.vector_db = None |
|
|
st.session_state.embedder = None |
|
|
st.session_state.chat_history = [] |
|
|
st.session_state.current_highlights = [] |
|
|
|
|
|
with st.spinner("๋ฌธ์ ๋ถ์์ ์์ํฉ๋๋ค..."): |
|
|
try: |
|
|
chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file) |
|
|
|
|
|
with st.spinner("ํต์ฌ ๋ด์ฉ์ ํ์
ํ๊ณ ์์ต๋๋ค..."):
|
|
collection, embedder = create_vector_db(chunks, metadata_list) |
|
|
|
|
|
st.session_state.vector_db = collection |
|
|
st.session_state.embedder = embedder |
|
|
st.session_state.pdf_bytes = pdf_bytes |
|
|
st.session_state.pdf_pages_text = pages_text |
|
|
st.session_state.processed = True |
|
|
st.session_state.doc_metadata = { |
|
|
"filename": uploaded_file.name, |
|
|
"chunks": len(chunks), |
|
|
"pages": len(set(m['page'] for m in metadata_list)) |
|
|
} |
|
|
|
|
|
|
|
|
saved_file = save_extracted_text_to_file( |
|
|
chunks, |
|
|
metadata_list, |
|
|
uploaded_file.name |
|
|
) |
|
|
|
|
|
st.success(f"๋ฌธ์ ์ฒ๋ฆฌ ์๋ฃ!") |
|
|
st.rerun() |
|
|
|
|
|
except Exception as e: |
|
|
st.error(f"์ค๋ฅ: {str(e)}") |
|
|
|
|
|
if st.session_state.processed: |
|
|
st.markdown("#### ๋ฌธ์ ์ ๋ณด") |
|
|
st.info(f"**{st.session_state.doc_metadata['filename']}**") |
|
|
st.info(f"ํ์ด์ง: {st.session_state.doc_metadata['pages']}") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if st.session_state.processed: |
|
|
col1, col2 = st.columns([1, 1]) |
|
|
|
|
|
with col1: |
|
|
header_cols = st.columns([7, 1, 1.5, 1]) |
|
|
with header_cols[0]: |
|
|
st.markdown("### ") |
|
|
|
|
|
if st.session_state.pdf_bytes: |
|
|
pdf_html = render_pdf_with_highlights( |
|
|
st.session_state.pdf_bytes, |
|
|
st.session_state.current_highlights, |
|
|
st.session_state.zoom_level |
|
|
) |
|
|
st.markdown(pdf_html, unsafe_allow_html=True) |
|
|
|
|
|
if st.session_state.scroll_to_page: |
|
|
scroll_js = f""" |
|
|
<script> |
|
|
const container = parent.document.querySelector('.pdf-container'); |
|
|
const targetPage = parent.document.getElementById('page-{st.session_state.scroll_to_page}'); |
|
|
|
|
|
if (container && targetPage) {{ |
|
|
const containerRect = container.getBoundingClientRect(); |
|
|
const targetRect = targetPage.getBoundingClientRect(); |
|
|
const scrollTop = container.scrollTop; |
|
|
const offset = targetRect.top - containerRect.top + scrollTop; |
|
|
|
|
|
container.scrollTo({{ |
|
|
top: offset - 20, |
|
|
behavior: 'smooth' |
|
|
}}); |
|
|
}} |
|
|
</script> |
|
|
""" |
|
|
components.html(scroll_js, height=0) |
|
|
st.session_state.scroll_to_page = None |
|
|
|
|
|
with col2: |
|
|
st.markdown('### ', unsafe_allow_html=True) |
|
|
|
|
|
chat_container = st.container(height=650) |
|
|
|
|
|
with chat_container: |
|
|
for msg_idx, msg in enumerate(st.session_state.chat_history): |
|
|
with st.chat_message(msg["role"]): |
|
|
st.markdown(msg["content"]) |
|
|
|
|
|
prompt = st.chat_input("์ง๋ฌธ์ ์
๋ ฅํ์ธ์...", key="chat_input")
|
|
|
|
|
if prompt: |
|
|
st.session_state.chat_history.append({"role": "user", "content": prompt}) |
|
|
st.session_state.processing_query = prompt |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
if st.session_state.processing_query: |
|
|
query = st.session_state.processing_query |
|
|
st.session_state.processing_query = None |
|
|
|
|
|
with st.spinner("PLOBIN์ด ์ต์ ์ ๋ต๋ณ์ ์ฐพ๊ณ ์์ต๋๋ค..."): |
|
|
try: |
|
|
search_results = hybrid_search( |
|
|
query, |
|
|
st.session_state.vector_db, |
|
|
st.session_state.embedder, |
|
|
top_k=3 |
|
|
) |
|
|
|
|
|
grok_result = grok_verify_and_extract( |
|
|
query, |
|
|
search_results, |
|
|
GROK_API_KEY |
|
|
) |
|
|
|
|
|
answer = generate_answer( |
|
|
query, |
|
|
search_results, |
|
|
GROK_API_KEY |
|
|
) |
|
|
|
|
|
|
|
|
print("\n" + "="*80) |
|
|
print("๋ต๋ณ์์ ์ธ์ฉ๋ฌธ ์ถ์ถ ์ค...") |
|
|
print("="*80) |
|
|
highlights = extract_highlights_from_answer(answer) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.session_state.current_highlights = highlights |
|
|
|
|
|
if grok_result and "page" in grok_result and "error" not in grok_result: |
|
|
st.session_state.scroll_to_page = grok_result["page"] |
|
|
|
|
|
chat_data = { |
|
|
"role": "assistant", |
|
|
"content": answer |
|
|
} |
|
|
st.session_state.chat_history.append(chat_data) |
|
|
st.rerun() |
|
|
|
|
|
except Exception as e: |
|
|
error_msg = f"โ ์ค๋ฅ: {str(e)}" |
|
|
st.session_state.chat_history.append({ |
|
|
"role": "assistant", |
|
|
"content": error_msg |
|
|
}) |
|
|
st.rerun() |
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
main() |
|
|
|