Luke committed · Commit 03b6d75 · Parent(s): 2d2df69
no message

Files changed:
- IdentifyModel/cardModel.py +23 -12
- Plan/AiLLM.py +19 -4
- Plan/pytesseractOCR.py +3 -15
- Preprocess/preprocessImg.py +11 -1
- app.py +27 -48
- requirements.txt +1 -1
IdentifyModel/cardModel.py
CHANGED
@@ -1,26 +1,37 @@
+import re
+
+
 def parse_id_card(text, validation_type, entities):
     if validation_type == "身分證正面":
+        # 正則表達式
+        birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
+        issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
+        unified_id_pattern = r"[A-Za-z]\d{9}"
+
+        birthdate = re.search(birthdate_pattern, text)
+        issue_date = re.search(issue_date_pattern, text)
+        unified_id = re.search(unified_id_pattern, text)
+
         result = {
             "解析全文內容": text,
-            "姓名": entities.get('B-PER', '無法解析')
-            "出生年月日":
-            "發證日期":
-            "統一編號":
+            "姓名": entities.get('B-PER', '無法解析'),
+            "出生年月日": birthdate.group() if birthdate else '無法解析',
+            "發證日期": issue_date.group() if issue_date else '無法解析',
+            "統一編號": unified_id.group() if unified_id else '無法解析'
         }
     elif validation_type == "身分證反面":
         result = {
             "解析全文內容": text,
-            "父": entities.get('B-FATHER', '無法解析')
-            "母": entities.get('B-MOTHER', '無法解析')
-            "配偶": entities.get('B-SPOUSE', '無法解析')
-            "出生地": entities.get('B-LOC', '無法解析')
-            "住址": entities.get('I-LOC', '無法解析')
-            "編號": entities.get('B-ID', '無法解析')
+            "父": entities.get('B-FATHER', '無法解析'),
+            "母": entities.get('B-MOTHER', '無法解析'),
+            "配偶": entities.get('B-SPOUSE', '無法解析'),
+            "出生地": entities.get('B-LOC', '無法解析'),
+            "住址": entities.get('I-LOC', '無法解析'),
+            "編號": entities.get('B-ID', '無法解析')
         }
     else:
         result = {
             "解析全文內容": text,
         }

     return result
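Note (not part of the commit): the front side now pulls the birthdate, issue date, and unified ID out of the raw OCR text with regular expressions instead of NER tags. A minimal sketch of how the committed patterns behave; the sample string below is illustrative, not from the commit:

    import re

    # Patterns as committed in parse_id_card
    birthdate_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
    issue_date_pattern = r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日(\S+)(?:補發|換發)"
    unified_id_pattern = r"[A-Za-z]\d{9}"

    # Hypothetical OCR output for a card front (illustrative only)
    sample = "王小明 民國85年7月1日出生 民國109年8月3日北市換發 統一編號A123456789"

    print(re.search(birthdate_pattern, sample).group())   # 民國85年7月1日
    print(re.search(issue_date_pattern, sample).group())  # 民國109年8月3日北市換發
    print(re.search(unified_id_pattern, sample).group())  # A123456789

Since birthdate_pattern also matches the date embedded in the issue line, and re.search returns the earliest occurrence, this scheme assumes the birthdate appears before the issue date in the OCR text.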
Plan/AiLLM.py
CHANGED
@@ -1,14 +1,29 @@
-import os
 import pytesseract
-from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 from IdentifyModel.cardModel import parse_id_card
+from transformers import BertTokenizer, BertForTokenClassification
+from transformers import pipeline
+
+# 加載預訓練模型和分詞器
+tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
+model = BertForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")

 # 初始化 Taiwanese BERT 模型
-tokenizer = AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese")
-model = AutoModelForTokenClassification.from_pretrained("ckiplab/bert-base-chinese-ner")
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)


+def extract_entities(text):
+    ner_results = ner_pipeline(text)
+    entities = {}
+    for result in ner_results:
+        entity = result['entity']
+        word = result['word']
+        if entity not in entities:
+            entities[entity] = word
+        else:
+            entities[entity] += word
+    return entities
+
+
 def llm_recognition(image, validation_type, language):
     text = pytesseract.image_to_string(image, lang=language)
     ner_results = ner_pipeline(text)
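Note (not part of the commit): the new extract_entities collapses the pipeline's token-level output into one string per entity tag by concatenating words that share a tag. A stand-alone sketch with a stubbed result list (the real pipeline output also carries score/index/start/end fields, omitted here):

    # Stubbed token-level NER output (illustrative)
    ner_results = [
        {'entity': 'B-PER', 'word': '王'},
        {'entity': 'B-PER', 'word': '小'},
        {'entity': 'B-PER', 'word': '明'},
    ]

    entities = {}
    for result in ner_results:
        entity, word = result['entity'], result['word']
        # Same fold as the committed code: first occurrence starts the
        # string, later tokens with the same tag are appended
        entities[entity] = entities.get(entity, '') + word

    print(entities)  # {'B-PER': '王小明'}

One consequence: any two spans with the same tag (e.g. two B-PER names) are merged into a single value, and parse_id_card's lookups such as entities.get('B-PER', ...) only ever see that merged string.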
Plan/pytesseractOCR.py
CHANGED
@@ -1,26 +1,14 @@
-# import cv2
-import os
 import pytesseract

 from IdentifyModel.cardModel import parse_id_card
-from
+from Plan.AiLLM import extract_entities


 def ocr_recognition(image, validation_type, language):
     try:
         custom_config = r'--oem 3 --psm 6'
         text = pytesseract.image_to_string(image, lang=language, config=custom_config)
-
+        entities = extract_entities(text)
+        return parse_id_card(text, validation_type, entities)
     except Exception as e:
         return str(e)
-
-# def ocr_recognition_2(image: str, lang: str = 'chi_tra') -> str:
-#     try:
-#         img = cv2.imread(image)
-#         gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
-#         threshold_img = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)[1]
-#         result = pytesseract.image_to_string(threshold_img, lang=lang)
-#         os.remove(image)
-#         return result
-#     except Exception as e:
-#         return str(e)
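Note (not part of the commit): ocr_recognition now returns the parsed dictionary rather than raw text, and it reuses extract_entities from Plan/AiLLM, which means even the plain-OCR path loads the BERT NER model at import time. A hypothetical usage sketch; the image file name and an installed chi_tra traineddata are assumptions:

    from PIL import Image
    from Plan.pytesseractOCR import ocr_recognition

    img = Image.open('id_front.png')  # hypothetical test image
    result = ocr_recognition(img, '身分證正面', 'chi_tra')
    # On success: dict keyed 解析全文內容/姓名/出生年月日/發證日期/統一編號
    # On failure: the exception text as a plain string
    print(result)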
Preprocess/preprocessImg.py
CHANGED
@@ -16,4 +16,14 @@ def preprocess_image001(image):
     _, binary = cv2.threshold(np.array(enhanced_image), 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
     # 去雜訊
     denoised = cv2.fastNlMeansDenoising(binary, None, 30, 7, 21)
-    return Image.fromarray(denoised)
+    return Image.fromarray(denoised)
+
+
+def preprocess_image002(image):
+    # 將 PIL Image 轉換為 numpy array
+    image_np = np.array(image)
+    # 使用 OpenCV 進行預處理
+    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)  # 灰階化
+    gray = cv2.bilateralFilter(gray, 11, 17, 17)  # 雙邊濾波去噪
+    edged = cv2.Canny(gray, 30, 200)  # 邊緣檢測
+    return Image.fromarray(edged)
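Note (not part of the commit): preprocess_image002 is an alternative pipeline of grayscale, bilateral filter, then Canny. Unlike preprocess_image001 (Otsu binarization plus non-local-means denoising), Canny yields a white-on-black edge map, which is usually a poor input for Tesseract; it appears to be here for side-by-side comparison. A stand-alone sketch of the same steps on a synthetic image (mirrors the committed code; the blank input is an assumption for runnability):

    import cv2
    import numpy as np
    from PIL import Image

    image = Image.new('RGB', (200, 100), 'white')      # synthetic stand-in
    image_np = np.array(image)
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)  # grayscale
    gray = cv2.bilateralFilter(gray, 11, 17, 17)       # edge-preserving denoise
    edged = cv2.Canny(gray, 30, 200)                   # binary edge map
    print(Image.fromarray(edged).size)                 # (200, 100)

One small caveat in the committed code: np.array(image) on a PIL image is RGB, so cv2.COLOR_BGR2GRAY swaps the red/blue channel weights; cv2.COLOR_RGB2GRAY would be the strictly correct flag, though the visual difference for OCR is minor.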
app.py
CHANGED
@@ -1,80 +1,59 @@
 import os
 import gradio as gr
-import pytesseract
-
 from Plan.AiLLM import llm_recognition
 from Plan.pytesseractOCR import ocr_recognition
-from Preprocess.preprocessImg import preprocess_image001
-
-langs = []
-
-choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
-
-# If you don't have tesseract executable in your PATH, include the following:
-# pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
-# Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'
-
-# Simple image to string
-# print(pytesseract.image_to_string(Image.open('eurotext.png')))
-
-# # French text image to string
-# print(pytesseract.image_to_string(Image.open('test-european.jpg'), lang='fra'))
-
-# # Get bounding box estimates
-# print(pytesseract.image_to_boxes(Image.open('test.png')))
-
-# # Get verbose data including boxes, confidences, line and page numbers
-# print(pytesseract.image_to_data(Image.open('test.png')))
-
-# # Get information about orientation and script detection
-# print(pytesseract.image_to_osd(Image.open('test.png'))
+from Preprocess.preprocessImg import preprocess_image001, preprocess_image002

 # 取得所有語言清單
 languages = os.popen('tesseract --list-langs').read().split('\n')[1:-1]

-print(' ======================================================== ')
-# print(' ###### choices:' + choices)
-# print(' ###### GET ENV - TESSDATA_PREFIX:' + os.getenv('TESSDATA_PREFIX'))
-# print(' ###### OS - TESSDATA_PREFIX:' + os.environ['TESSDATA_PREFIX'])
-# os.environ['TESSDATA_PREFIX'] = os.getenv('TESSDATA_PREFIX')
-# print(' ###### Tesseract_Cmd:' + pytesseract.pytesseract.tesseract_cmd)
-# pytesseract.pytesseract.tesseract_cmd = os.getenv('TESSDATA_PREFIX')
-print(' ======================================================== ')

-def preprocess_and_ocr(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    ocr_result = ocr_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, ocr_result
+def preprocess_and_ocr(image, valid_type, language):
+    # 方案一
+    pre_img_001 = preprocess_image001(image)
+    ocr_result_001 = ocr_recognition(pre_img_001, valid_type, language)
+    # 方案二
+    pre_img_002 = preprocess_image002(image)
+    ocr_result_002 = ocr_recognition(pre_img_002, valid_type, language)
+
+    return pre_img_001, pre_img_002, ocr_result_001, ocr_result_002

-def preprocess_and_llm(image, validation_type, language):
-    preprocessed_image = preprocess_image001(image)
-    llm_result = llm_recognition(preprocessed_image, validation_type, language)
-    return preprocessed_image, llm_result
+
+def preprocess_and_llm(image, valid_type, language):
+    # 方案一
+    pre_img_001 = preprocess_image001(image)
+    llm_result_001 = llm_recognition(pre_img_001, valid_type, language)
+    # 方案二
+    pre_img_002 = preprocess_image002(image)
+    llm_result_002 = llm_recognition(pre_img_002, valid_type, language)

+    return pre_img_001, pre_img_002, llm_result_001, llm_result_002


 with gr.Blocks() as demo:
     with gr.Row():
         image_input = gr.Image(type="pil", label="上傳圖片")
+        preprocess_output_001 = gr.Image(type="pil", label="預處理後的圖片-方案一")
+        preprocess_output_002 = gr.Image(type="pil", label="預處理後的圖片-方案二")

     with gr.Row():
         validation_type = gr.Dropdown(choices=["身分證正面", "身分證反面"], label="驗證類別")
         language_dropdown = gr.Dropdown(choices=languages, value="chi_tra", label="語言")
+        # preprocessed_type = gr.Radio(["001", "002"], label="解析方案")

     with gr.Row():
         ocr_button = gr.Button("使用 OCR")
         llm_button = gr.Button("使用 AI LLM")

     with gr.Row():
+        ocr_output_001 = gr.JSON(label="OCR-001-解析結果")
+        ocr_output_002 = gr.JSON(label="OCR-002-解析結果")
+        llm_output_001 = gr.JSON(label="AiLLM-001 解析結果")
+        llm_output_002 = gr.JSON(label="AiLLM-002 解析結果")

     ocr_button.click(preprocess_and_ocr, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[
+                     outputs=[preprocess_output_001, preprocess_output_002, ocr_output_001, ocr_output_002])
     llm_button.click(preprocess_and_llm, inputs=[image_input, validation_type, language_dropdown],
-                     outputs=[
+                     outputs=[preprocess_output_001, preprocess_output_002, llm_output_001, llm_output_002])

 demo.launch(share=False)
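Note (not part of the commit): each handler now returns four values, and Gradio maps the returned tuple positionally onto the components listed in outputs, so one click fills both preview images plus the matching pair of JSON panels while leaving the other pair untouched. A minimal sketch of this positional wiring pattern, using a generic handler and components (all names here are illustrative):

    import gradio as gr

    def handler(text):
        # Four return values map onto the four outputs, in order
        return text.upper(), text.lower(), {"len": len(text)}, {"words": len(text.split())}

    with gr.Blocks() as demo:
        inp = gr.Textbox(label="input")
        out1, out2 = gr.Textbox(), gr.Textbox()
        js1, js2 = gr.JSON(), gr.JSON()
        gr.Button("run").click(handler, inputs=[inp], outputs=[out1, out2, js1, js2])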
requirements.txt
CHANGED
@@ -4,4 +4,4 @@ transformers
 Pillow
 torch
 huggingface-hub
-opencv-python
+opencv-python