Spaces:
Running
Running
Update docSim.py
Browse files
docSim.py
CHANGED
|
@@ -19,6 +19,16 @@ def semantic_similarity(text1, text2):
|
|
| 19 |
emb2 = MODEL.encode(text2, normalize_embeddings=True)
|
| 20 |
return float(util.cos_sim(emb1, emb2))
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def calcDocSims(file):
|
| 23 |
texts = re.findall(r'text=(.*?),\s*error=', file, flags=re.DOTALL)
|
| 24 |
res_list = [extract_core_painpoints(t) for t in texts]
|
|
|
|
| 19 |
emb2 = MODEL.encode(text2, normalize_embeddings=True)
|
| 20 |
return float(util.cos_sim(emb1, emb2))
|
| 21 |
|
| 22 |
+
def extract_core_painpoints(text):
    """Return the text between the "核心痛点" and "优化措施" headings.

    The heading may be followed by ASCII colons, full-width colons
    (U+FF1A) and/or whitespace; those are skipped. Inside the captured
    section every item label of the form "- 核心教学痛点<N>:" is removed,
    so only the pain-point descriptions remain.

    Args:
        text: The string to search.

    Returns:
        The cleaned section text, or "" when ``text`` is empty/None or
        when either heading is missing.
    """
    if not text:
        return ""

    # The full-width colon is spelled as the escape \uff1a (understood by
    # the re module) so it cannot be silently folded into an ASCII ':' by
    # editors or text normalisation, which would make the class a no-op.
    # [\s\S] matches any character including newlines, so no DOTALL flag
    # is needed.
    section = re.search(r"核心痛点[:\uff1a\s]*([\s\S]*?)优化措施", text)
    if section is None:
        return ""

    body = section.group(1).strip()
    # Strip the item labels (optional leading dash, optional number,
    # ASCII or full-width colon) and any whitespace that follows them.
    return re.sub(r"-?\s*核心教学痛点\d*[:\uff1a]\s*", "", body)
|
| 31 |
+
|
| 32 |
def calcDocSims(file):
|
| 33 |
texts = re.findall(r'text=(.*?),\s*error=', file, flags=re.DOTALL)
|
| 34 |
res_list = [extract_core_painpoints(t) for t in texts]
|