Till Fischer
committed on
Commit
·
8aac46d
1
Parent(s):
564a8b1
Clean commit ohne Tokens
Browse files- analyze_aspects.py +194 -0
- aspect-sentiment-analyzer/.gitattributes +35 -0
- aspect-sentiment-analyzer/README.md +12 -0
analyze_aspects.py
CHANGED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# analyze_aspects.py
|
| 3 |
+
|
| 4 |
+
#python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
|
| 5 |
+
# python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
|
| 6 |
+
|
| 7 |
+
import sqlite3
|
| 8 |
+
import argparse
|
| 9 |
+
import logging
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import nltk
|
| 12 |
+
from transformers import pipeline
|
| 13 |
+
from collections import defaultdict
|
| 14 |
+
import matplotlib.pyplot as plt
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
    """Draw a horizontal bar chart of the mean sentiment score per aspect and save it.

    Args:
        aspect_results: Mapping of aspect name to the signed sentiment scores
            collected for that aspect.
        output_dir: Directory the image is written to; created if it is missing.
        filename: Name of the image file inside ``output_dir``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # An aspect without any score has no mean; skipping it avoids a
    # ZeroDivisionError instead of aborting the whole chart.
    averages = {
        aspect: sum(scores) / len(scores)
        for aspect, scores in aspect_results.items()
        if scores
    }
    aspects = list(averages)
    avg_scores = list(averages.values())
    # Scores within +/-0.1 of zero are drawn in gray.
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]

    plt.figure(figsize=(10, 6))
    try:
        bars = plt.barh(aspects, avg_scores, color=colors)
        plt.axvline(x=0, color='black', linewidth=0.8)
        plt.xlabel("Durchschnittlicher Sentiment-Score")
        plt.title("Sentiment-Analyse pro Aspekt")

        for bar, score in zip(bars, avg_scores):
            # Put the value just beyond the end of the bar: to the right of
            # positive bars and to the left of negative ones, so the text
            # never overlaps the bar itself.
            if score < 0:
                offset, align = -0.01, 'right'
            else:
                offset, align = 0.01, 'left'
            plt.text(bar.get_width() + offset, bar.get_y() + bar.get_height() / 2,
                     f"{score:.2f}", va='center', ha=align)

        plt.tight_layout()
        # Show the aspects top-down in insertion order.
        plt.gca().invert_yaxis()

        output_path = output_dir / filename
        plt.savefig(output_path, dpi=300)
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close()

    logger.info(f"Diagramm gespeichert unter: {output_path}")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# Sentence-tokenizer data for sent_tokenize. Older NLTK releases load the
# pickled 'punkt' model, newer ones look up 'punkt_tab', so both are ensured.
# Each resource is downloaded only when it is not already installed, which
# avoids a network round trip on every import.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)
from nltk import sent_tokenize
|
| 47 |
+
|
| 48 |
+
# Logging Configuration
|
| 49 |
+
def configure_logging():
    """Configure logging at INFO level with a timestamped format and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger


# Module-wide logger used by the functions in this file.
logger = configure_logging()
|
| 54 |
+
|
| 55 |
+
# Aspect label maps: each key is an aspect category, each value the list of
# candidate labels that are reported under that category.

# German labels.
ASPECT_LABEL_MAP = {
    "Handlung": [
        "Handlung", "Plot", "Story", "Aufbau",
    ],
    "Charaktere": [
        "Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen",
    ],
    "Stil": [
        "Stil", "Sprachstil", "Sprache", "Erzählweise",
    ],
    "Emotionale Wirkung": [
        "Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend",
    ],
    "Tiefgang": [
        "Tiefgang", "Nachdenklich", "Philosophisch", "kritisch",
    ],
    "Thema & Kontext": [
        "Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft",
    ],
    "Originalität": [
        "Originalität", "Kreativität", "Innovativ", "Idee", "Humor",
    ],
    "Recherche & Authentizität": [
        "Recherche", "Authentizität", "Realismus", "Fakten",
    ],
}

# English labels.
ASPECT_LABEL_MAP_EN = {
    "Plot": [
        "Plot", "Story", "Narrative", "Structure",
    ],
    "Characters": [
        "Characters", "Protagonists", "Antagonists", "Relationships",
    ],
    "Style": [
        "Style", "Language", "Tone", "Narration",
    ],
    "Emotional Impact": [
        "Touching", "Funny", "Exciting", "Moving", "Engaging",
    ],
    "Depth": [
        "Philosophical", "Thought-provoking", "Insightful", "Critical",
    ],
    "Theme & Context": [
        "Theme", "Motif", "Historical Context", "Social Issues",
    ],
    "Originality": [
        "Originality", "Creativity", "Innovation", "Idea",
    ],
    "Research & Authenticity": [
        "Research", "Authenticity", "Realism", "Facts",
    ],
}

# Flat list of every German label, in map order.
ALL_LABELS = [name for group in ASPECT_LABEL_MAP.values() for name in group]
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# --- Datenbankzugriff ---
|
| 82 |
+
|
| 83 |
+
def load_reviews(db_path: Path, isbn: str) -> list:
    """Load the cleaned review texts of one book from the SQLite database.

    Args:
        db_path: Path to the SQLite database file.
        isbn: ISBN matched against ``reviews_und_notizen.buch_isbn``.

    Returns:
        A list of ``(review_id, text, lang)`` tuples. ``lang`` is ``'de'`` for
        the ``cleaned_text`` column and ``'en'`` for ``cleaned_text_en``. A row
        contributes one entry per non-empty string column, so a review with
        both texts appears twice.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
            (isbn,),
        ).fetchall()
    finally:
        # Close the connection even when the query fails (e.g. missing table).
        conn.close()

    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        for text, lang in ((text_de, 'de'), (text_en, 'en')):
            # Skip NULL, empty and non-text values.
            if text and isinstance(text, str):
                texts_to_analyze.append((review_id, text, lang))
    return texts_to_analyze
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# --- Analysefunktion ---
|
| 102 |
+
|
| 103 |
+
# A sentence is assigned to an aspect only if a label scores above this value.
_ASPECT_SCORE_THRESHOLD = 0.8
# Sentences shorter than this many characters are skipped.
_MIN_SENTENCE_LENGTH = 15


def _signed_sentiment(label: str, confidence: float) -> float:
    """Turn a sentiment label and its confidence into a signed score."""
    normalized = label.upper()
    if normalized.startswith('POS'):
        return confidence
    if normalized.startswith('NEG'):
        return -confidence
    # Any other label (e.g. a 'neutral' class) carries no polarity; treating it
    # as negative would skew the per-aspect averages.
    return 0.0


def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: "list[str] | None" = None) -> dict:
    """Run aspect-based sentiment analysis over the stored reviews of one book.

    Every review is split into sentences. A zero-shot classifier assigns each
    sentence to an aspect (only if a label scores above 0.8), and a
    language-specific sentiment model provides the signed score recorded for
    that aspect.

    Args:
        db_path: Path to the SQLite database holding the reviews.
        isbn: ISBN of the book to analyze.
        device: Device index handed to the transformers pipelines
            (``-1`` for CPU, ``0`` for the first GPU).
        languages: Review languages to include (``"de"`` and/or ``"en"``).
            ``None`` means both.

    Returns:
        Mapping of aspect name to the list of signed sentiment scores found
        for it; an empty dict if no review matches.
    """
    if languages is None:
        languages = ["de", "en"]

    reviews = [r for r in load_reviews(db_path, isbn) if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

    # NOTE(review): facebook/bart-large-mnli appears to be an English NLI model;
    # German labels and hypotheses may classify less reliably -- confirm, or
    # consider a multilingual NLI model.
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)

    # Per-language settings: (aspect map, flat label list, sentiment pipeline,
    # hypothesis template, NLTK tokenizer language). A sentiment model is only
    # loaded when reviews in its language are actually present.
    present = {lang for _, _, lang in reviews}
    configs = {}
    if 'de' in present:
        configs['de'] = (
            ASPECT_LABEL_MAP,
            ALL_LABELS,
            pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device),
            "Dieser Satz handelt von {}.",
            'german',
        )
    if 'en' in present:
        configs['en'] = (
            ASPECT_LABEL_MAP_EN,
            [label for labels in ASPECT_LABEL_MAP_EN.values() for label in labels],
            pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device),
            "This sentence is about {}.",
            'english',
        )

    aspect_results = defaultdict(list)
    total_aspects = 0

    for review_id, text, lang in reviews:
        if not text or lang not in configs:
            continue

        aspect_map, all_labels, sent_pipeline, hypothesis_template, tokenizer_lang = configs[lang]

        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")

        for sent in sent_tokenize(text, language=tokenizer_lang):
            if not sent.strip() or len(sent) < _MIN_SENTENCE_LENGTH:
                continue

            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)

            # First label that clears the threshold, if any.
            hit = next(
                ((label, score) for label, score in zip(result["labels"], result["scores"])
                 if score > _ASPECT_SCORE_THRESHOLD),
                None,
            )
            if hit is None:
                continue
            matched_label, best_score = hit
            # Map the concrete label back to its aspect category.
            main_label = next((k for k, v in aspect_map.items() if matched_label in v), matched_label)

            ml_sentiment = sent_pipeline(sent)[0]
            final_score = _signed_sentiment(ml_sentiment['label'], ml_sentiment['score'])
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'

            print(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f"  Aspekt: {main_label} (via '{matched_label}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )

            aspect_results[main_label].append(final_score)
            total_aspects += 1

    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# --- Entry Point ---
|
| 173 |
+
|
| 174 |
+
def main():
    """Parse the command-line arguments, run the aspect analysis and save the chart.

    The chart is written to the ``output`` directory relative to the current
    working directory; nothing is written when the analysis yields no data.
    """
    parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    args = parser.parse_args()

    # 0 selects the first GPU, -1 the CPU.
    device = 0 if args.gpu else -1
    aspect_results = analyze_quickwin(
        Path(args.db_path), args.isbn,
        device=device,
        languages=args.languages
    )

    if aspect_results:
        output_dir = Path("output")
        visualize_aspects(aspect_results, output_dir)
    else:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")


# Without this guard the script defined main() but never ran it when invoked
# from the command line.
if __name__ == "__main__":
    main()
|
aspect-sentiment-analyzer/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
aspect-sentiment-analyzer/README.md
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Aspect Sentiment Analyzer
|
| 3 |
+
emoji: 🌖
|
| 4 |
+
colorFrom: yellow
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.34.2
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|