| | """ |
| | AIFinder Feature Extraction |
| | TF-IDF and stylometric features for AI model detection. |
| | """ |
| |
|
| | import re |
| | import numpy as np |
| | from scipy.sparse import csr_matrix, hstack |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| | from sklearn.base import BaseEstimator, TransformerMixin |
| | from sklearn.preprocessing import MaxAbsScaler |
| |
|
| | from config import TFIDF_WORD_PARAMS, TFIDF_CHAR_PARAMS |
| |
|
| |
|
def strip_cot(text):
    """Return *text* with closed chain-of-thought blocks removed.

    A block is an opening ``<think>`` or ``<thinking>`` tag, everything up to
    the nearest closing ``</think>`` or ``</thinking>`` tag (newlines
    included), and that closing tag. Text without a matching closing tag is
    left as is. Leading and trailing whitespace is trimmed from the result.
    """
    cot_block = re.compile(r"<think(?:ing)?>.*?</think(?:ing)?>", flags=re.DOTALL)
    without_cot = cot_block.sub("", text)
    return without_cot.strip()
| |
|
| |
|
def strip_markdown(text):
    """Return *text* with common Markdown syntax removed.

    The rules below run in order; each is ``(pattern, replacement, flags)``.
    Fenced and inline code, block-quote lines and horizontal rules are
    dropped entirely; emphasis markers, header/list prefixes and link
    targets are removed while the visible text is kept. The result is
    stripped of leading and trailing whitespace.
    """
    rules = (
        (r"```[\s\S]*?```", "", 0),                       # fenced code blocks
        (r"`[^`]+`", "", 0),                              # inline code
        (r"\*\*([^*]+)\*\*", r"\1", 0),                   # **bold**
        (r"\*([^*]+)\*", r"\1", 0),                       # *italic*
        (r"__([^_]+)__", r"\1", 0),                       # __bold__
        (r"_([^_]+)_", r"\1", 0),                         # _italic_
        (r"^#{1,6}\s+", "", re.MULTILINE),                # header prefixes
        (r"^[\s]*[-*+]\s+", "", re.MULTILINE),            # bullet prefixes
        (r"^\s*\d+[.)]\s+", "", re.MULTILINE),            # numbered-list prefixes
        (r"\[([^\]]+)\]\([^)]+\)", r"\1", 0),             # [label](target) -> label
        (r"^>.*$", "", re.MULTILINE),                     # block-quote lines
        (r"^---+$", "", re.MULTILINE),                    # horizontal rules
    )
    for pattern, replacement, flags in rules:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text.strip()
| |
|
| |
|
class StylometricFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that maps each text to a stylometric vector.

    ``fit`` learns nothing; ``transform`` calls ``_extract`` on every input
    string and stacks the resulting rows into a float32 CSR matrix. The
    order of the values returned by ``_extract`` defines the column order.
    """

    # Word lists used for the lexical ratios. Tokens are compared after
    # lower-casing and stripping surrounding punctuation (see ``_extract``).
    _FIRST_PERSON = frozenset(
        {"i", "me", "my", "mine", "myself", "we", "us", "our", "ours", "ourselves"}
    )
    _SECOND_PERSON = frozenset({"you", "your", "yours", "yourself", "yourselves"})
    _THIRD_PERSON = frozenset(
        {"he", "she", "it", "they", "them", "his", "her", "its", "their"}
    )
    _CONJUNCTIONS = frozenset(
        {
            "and", "but", "or", "nor", "for", "yet", "so",
            "because", "although", "while", "if", "when", "where",
        }
    )
    _DISCOURSE = frozenset(
        {
            "however", "therefore", "moreover", "furthermore",
            "nevertheless", "consequently", "thus", "hence",
        }
    )
    _HEDGING = frozenset(
        {
            "perhaps", "maybe", "might", "could", "possibly",
            "seemingly", "apparently", "arguably", "potentially",
        }
    )
    _CERTAINTY = frozenset(
        {
            "definitely", "certainly", "absolutely", "clearly",
            "obviously", "undoubtedly", "indeed", "surely",
        }
    )
    _TRANSITION = frozenset(
        {
            "additionally", "meanwhile", "subsequently", "alternatively",
            "specifically", "notably", "importantly", "essentially",
        }
    )
    _CASUAL = frozenset(
        {
            "okay", "ok", "hey", "hi", "cool", "awesome", "wow",
            "basically", "actually", "literally", "right", "yeah",
        }
    )
    _FORMAL = frozenset(
        {
            "regarding", "concerning", "pertaining", "aforementioned",
            "respectively", "accordingly", "henceforth", "whereby",
            "notwithstanding", "pursuant",
        }
    )

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return a float32 CSR matrix with one row of features per text in *X*.

        An empty *X* yields a ``(0, n_features)`` matrix. (Building the
        matrix from an empty 1-D array would instead give shape ``(1, 0)``,
        which has the wrong row count for stacking with other blocks.)
        """
        rows = [self._extract(text) for text in X]
        if not rows:
            # Probe with an empty string only to learn the column count.
            n_features = len(self._extract(""))
            return csr_matrix((0, n_features), dtype=np.float32)
        return csr_matrix(np.array(rows, dtype=np.float32))

    @staticmethod
    def _share(tokens, vocabulary, total):
        """Return the number of *tokens* found in *vocabulary*, divided by *total*."""
        return sum(1 for tok in tokens if tok in vocabulary) / total

    def _extract(self, text):
        """Compute the stylometric feature list for a single string.

        Counts are normalised by character, word, sentence or line totals;
        every such denominator is clamped to at least 1, so an empty string
        produces finite values. Returns a flat list of numbers whose order
        is the column order of the matrix built by ``transform``.
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        # --- basic segmentation ------------------------------------------
        words = text.split()
        n_chars = max(len(text), 1)
        n_words = max(len(words), 1)

        sentences = [s.strip() for s in re.split(r"[.!?]+", text) if s.strip()]
        n_sentences = max(len(sentences), 1)

        non_empty_paras = [p for p in text.split("\n\n") if p.strip()]
        n_paragraphs = len(non_empty_paras)

        non_empty_lines = [ln for ln in text.split("\n") if ln.strip()]
        n_lines = max(len(non_empty_lines), 1)

        # --- word-length statistics --------------------------------------
        word_lens = [len(w) for w in words]
        avg_word_len = np.mean(word_lens) if words else 0
        word_len_std = np.std(word_lens) if len(words) > 1 else 0
        median_word_len = np.median(word_lens) if words else 0
        avg_sent_len = n_words / n_sentences

        # --- punctuation densities (per character) -----------------------
        n_commas = text.count(",") / n_chars
        n_semicolons = text.count(";") / n_chars
        n_colons = text.count(":") / n_chars
        n_dash = (text.count("—") + text.count("–") + text.count("--")) / n_chars
        n_parens = (text.count("(") + text.count(")")) / n_chars
        n_quotes = (text.count('"') + text.count("'")) / n_chars
        n_exclaim = text.count("!") / n_chars
        n_question = text.count("?") / n_chars
        n_period = text.count(".") / n_chars
        n_ellipsis = (text.count("...") + text.count("…")) / n_chars

        # The 0.001 offset keeps the ratios finite when the divisor is zero.
        comma_colon_ratio = n_commas / (n_colons + 0.001)
        comma_period_ratio = n_commas / (n_period + 0.001)
        excl_question_ratio = n_exclaim / (n_question + 0.001)

        # --- markdown markers (per sentence) -----------------------------
        n_headers = len(re.findall(r"^#{1,6}\s", text, re.MULTILINE)) / n_sentences
        n_bold = len(re.findall(r"\*\*.*?\*\*", text)) / n_sentences
        n_code_blocks = len(re.findall(r"```", text)) / n_sentences
        n_inline_code = len(re.findall(r"`[^`]+`", text)) / n_sentences
        n_bullet = len(re.findall(r"^[\s]*[-*+]\s", text, re.MULTILINE)) / n_sentences
        n_numbered = (
            len(re.findall(r"^\s*\d+[.)]\s", text, re.MULTILINE)) / n_sentences
        )
        n_tables = len(re.findall(r"\|.*\|", text)) / n_sentences

        has_list = 1.0 if n_bullet > 0 or n_numbered > 0 else 0.0
        list_items = n_bullet + n_numbered

        # --- character-class ratios --------------------------------------
        n_newlines = text.count("\n")
        newline_density = n_newlines / n_chars
        double_newline_ratio = text.count("\n\n") / (n_newlines + 1)
        uppercase_ratio = sum(1 for c in text if c.isupper()) / n_chars
        digit_ratio = sum(1 for c in text if c.isdigit()) / n_chars
        space_ratio = sum(1 for c in text if c.isspace()) / n_chars

        unique_chars = len(set(text)) / n_chars
        unique_chars_ratio = len(set(text.lower())) / n_chars

        # --- sentence-length statistics (in words) -----------------------
        sent_lens = [len(s.split()) for s in sentences]
        sent_len_std = np.std(sent_lens) if len(sent_lens) > 1 else 0
        sent_len_max = max(sent_lens) if sent_lens else 0
        sent_len_min = min(sent_lens) if sent_lens else 0
        sent_len_median = np.median(sent_lens) if sent_lens else 0
        sent_len_range = sent_len_max - sent_len_min

        # --- structural flags --------------------------------------------
        has_think = 1.0 if re.search(r"<think>", text) else 0.0
        has_xml = 1.0 if re.search(r"<[^>]+>", text) else 0.0
        has_hr = 1.0 if re.search(r"^---+", text, re.MULTILINE) else 0.0
        has_url = 1.0 if re.search(r"https?://", text) else 0.0

        # --- lexical ratios ----------------------------------------------
        words_lower = [w.lower().strip(".,!?;:'\"()[]{}") for w in words]

        first_person_ratio = self._share(words_lower, self._FIRST_PERSON, n_words)
        second_person_ratio = self._share(words_lower, self._SECOND_PERSON, n_words)
        third_person_ratio = self._share(words_lower, self._THIRD_PERSON, n_words)

        # One counting pass gives both the vocabulary size and the number of
        # words that occur exactly once (avoids list.count per unique word).
        word_counts = Counter(words_lower)
        ttr = len(word_counts) / n_words
        hapax = sum(1 for freq in word_counts.values() if freq == 1)
        hapax_ratio = hapax / n_words

        contraction_count = len(re.findall(r"\b\w+'\w+\b", text))
        contraction_ratio = contraction_count / n_words

        # --- sentence starters -------------------------------------------
        starters = [s.split()[0].lower() if s.split() else "" for s in sentences]
        starter_vocab = len(set(starters)) / n_sentences

        and_starts = starters.count("and") / n_sentences
        but_starts = starters.count("but") / n_sentences
        so_starts = starters.count("so") / n_sentences
        the_starts = starters.count("the") / n_sentences
        it_starts = sum(1 for s in starters if s in ("it", "it's")) / n_sentences
        i_starts = (
            sum(1 for s in starters if s in ("i", "i'm", "i've")) / n_sentences
        )

        # --- word-length buckets -----------------------------------------
        token_lens = [len(w) for w in words_lower]
        short_word_ratio = sum(1 for n in token_lens if n <= 2) / n_words
        medium_word_ratio = sum(1 for n in token_lens if 3 <= n <= 6) / n_words
        long_word_ratio = sum(1 for n in token_lens if n >= 7) / n_words
        very_long_word_ratio = sum(1 for n in token_lens if n >= 10) / n_words

        # --- paragraph lengths (in words) --------------------------------
        para_lens = [len(p.split()) for p in non_empty_paras] or [0]
        avg_para_len = np.mean(para_lens)
        para_len_std = np.std(para_lens) if len(para_lens) > 1 else 0

        # --- function-word families --------------------------------------
        conjunction_ratio = self._share(words_lower, self._CONJUNCTIONS, n_words)
        discourse_ratio = self._share(words_lower, self._DISCOURSE, n_words)
        hedging_ratio = self._share(words_lower, self._HEDGING, n_words)
        certainty_ratio = self._share(words_lower, self._CERTAINTY, n_words)
        transition_ratio = self._share(words_lower, self._TRANSITION, n_words)
        casual_ratio = self._share(words_lower, self._CASUAL, n_words)
        formal_ratio = self._share(words_lower, self._FORMAL, n_words)

        # --- questions ---------------------------------------------------
        wh_prefixes = ("who", "what", "when", "where", "why", "how")
        question_starts = sum(
            1 for s in sentences if s and s.strip().lower().startswith(wh_prefixes)
        )
        rhetorical_q = sum(1 for ln in text.split("\n") if ln.strip().endswith("?"))
        rhetorical_ratio = rhetorical_q / n_sentences
        ends_with_question = 1.0 if text.rstrip().endswith("?") else 0.0
        question_lines = [ln for ln in non_empty_lines if ln.strip().endswith("?")]
        question_line_ratio = len(question_lines) / n_lines

        # --- emoji / unicode ---------------------------------------------
        emoji_count = len(re.findall(r"[\U00010000-\U0010ffff]", text))
        has_emoji = 1.0 if emoji_count > 0 else 0.0
        has_checkmark = (
            1.0 if "✓" in text or "✗" in text or "✔" in text or "✘" in text else 0.0
        )
        has_arrow = 1.0 if "→" in text or "←" in text or "➡" in text else 0.0
        has_star = 1.0 if "⭐" in text or "★" in text or "☆" in text else 0.0
        special_unicode = len(re.findall(r"[^\x00-\x7F]", text)) / n_chars

        chinese_chars = len(re.findall(r"[\u4e00-\u9fff]", text))
        has_chinese = 1.0 if chinese_chars > 0 else 0.0
        chinese_ratio = chinese_chars / n_chars

        # --- capitalisation ----------------------------------------------
        all_caps_words = sum(
            1 for w in words if len(w) > 1 and w.isupper() and w.isalpha()
        )
        all_caps_ratio = all_caps_words / n_words
        cap_words = len(re.findall(r"\b[A-Z][a-z]+\b", text))
        cap_word_ratio = cap_words / n_words

        # --- parentheses, quotes, colons ---------------------------------
        paren_count = len(re.findall(r"\([^)]+\)", text))
        paren_ratio = paren_count / n_sentences
        colon_definitions = len(re.findall(r"\b\w+:\s+\w+", text)) / n_sentences
        double_quote_pairs = len(re.findall(r'"[^"]*"', text)) / n_sentences
        single_quote_pairs = len(re.findall(r"'[^']*'", text)) / n_sentences

        # --- assistant-style phrasing ------------------------------------
        has_self_id_ai = (
            1.0
            if re.search(
                r"\b(I'm|I am)\s+(an?\s+)?(AI|language model|assistant|chatbot)\b",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )
        has_provider_mention = (
            1.0
            if re.search(
                r"\b(Claude|Anthropic|GPT|OpenAI|ChatGPT|Gemini|Google|Bard|Grok|xAI"
                r"|DeepSeek|Kimi|Moonshot|Mistral|MiniMax|Zhipu|GLM|深度求索)\b",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )
        has_closing_offer = (
            1.0
            if re.search(
                r"(let me know|feel free|happy to help|don't hesitate|hope this helps)",
                text,
                re.IGNORECASE,
            )
            else 0.0
        )
        greeting_patterns = len(
            re.findall(
                r"\b(hi|hello|hey|hiya|greetings|howdy|yo)\b", text, re.IGNORECASE
            )
        )
        greeting_ratio = greeting_patterns / n_sentences
        conversational_phrases = len(
            re.findall(
                r"\b(great|perfect|sure|definitely|certainly|absolutely|of course"
                r"|no problem|sounds good|got it|understood|okay|alright)\b",
                text,
                re.IGNORECASE,
            )
        )
        conv_phrase_ratio = conversational_phrases / n_words
        helpful_phrases = len(
            re.findall(
                r"\b(let me know|feel free|happy to|glad to|happy to help"
                r"|don't hesitate|let me know if|please let me|reach out)\b",
                text,
                re.IGNORECASE,
            )
        )
        helpful_ratio = helpful_phrases / n_sentences

        # --- per-sentence / per-line shape -------------------------------
        commas_per_sentence = text.count(",") / n_sentences
        avg_line_len = (
            np.mean([len(ln) for ln in non_empty_lines]) if non_empty_lines else 0
        )
        short_lines_ratio = (
            sum(1 for ln in non_empty_lines if len(ln.split()) <= 5) / n_lines
        )
        four_word_phrases = len(re.findall(r"\b\w+\s+\w+\s+\w+\s+\w+\b", text))
        phrase_ratio = four_word_phrases / n_sentences
        sent_boundaries = len(re.findall(r"[.!?]\s+[A-Z]", text))
        sent_boundary_ratio = sent_boundaries / n_sentences

        # --- length buckets ----------------------------------------------
        is_short = 1.0 if n_words < 100 else 0.0
        is_medium = 1.0 if 100 <= n_words < 500 else 0.0
        is_long = 1.0 if n_words >= 500 else 0.0

        # NOTE(review): `sentences` comes from splitting on [.!?]+, so no
        # fragment can end with "!" and this count is always 0. Left as is to
        # keep the feature column identical for already-fitted models.
        excl_sentences = sum(1 for s in sentences if s.strip().endswith("!"))
        excl_sentence_ratio = excl_sentences / n_sentences

        # Column order below is part of the contract with fitted models.
        return [
            # word / sentence length
            avg_word_len,
            word_len_std,
            median_word_len,
            avg_sent_len,
            sent_len_std,
            sent_len_max,
            sent_len_min,
            sent_len_median,
            sent_len_range,
            commas_per_sentence,
            # punctuation
            n_commas,
            n_semicolons,
            n_colons,
            n_dash,
            n_parens,
            n_quotes,
            n_exclaim,
            n_question,
            n_period,
            n_ellipsis,
            comma_colon_ratio,
            comma_period_ratio,
            excl_question_ratio,
            # markdown
            n_headers,
            n_bold,
            n_code_blocks,
            n_inline_code,
            n_bullet,
            n_numbered,
            n_tables,
            has_list,
            # layout / character classes
            newline_density,
            double_newline_ratio,
            uppercase_ratio,
            digit_ratio,
            space_ratio,
            unique_chars,
            unique_chars_ratio,
            list_items,
            n_paragraphs,
            n_lines / n_sentences,
            # structural flags
            has_think,
            has_xml,
            has_hr,
            has_url,
            # pronouns
            first_person_ratio,
            second_person_ratio,
            third_person_ratio,
            # vocabulary
            ttr,
            hapax_ratio,
            contraction_ratio,
            short_word_ratio,
            medium_word_ratio,
            long_word_ratio,
            very_long_word_ratio,
            # sentence starters
            starter_vocab,
            and_starts,
            but_starts,
            so_starts,
            the_starts,
            it_starts,
            # paragraphs
            avg_para_len,
            para_len_std,
            # function-word families
            conjunction_ratio,
            discourse_ratio,
            hedging_ratio,
            certainty_ratio,
            transition_ratio,
            question_starts / n_sentences,
            # emoji / unicode
            emoji_count,
            has_emoji,
            special_unicode,
            # register and identity
            all_caps_ratio,
            paren_ratio,
            rhetorical_ratio,
            casual_ratio,
            formal_ratio,
            has_chinese,
            chinese_ratio,
            has_self_id_ai,
            has_provider_mention,
            ends_with_question,
            has_closing_offer,
            has_checkmark,
            has_arrow,
            has_star,
            avg_line_len,
            short_lines_ratio,
            cap_word_ratio,
            phrase_ratio,
            sent_boundary_ratio,
            colon_definitions,
            double_quote_pairs,
            single_quote_pairs,
            i_starts,
            greeting_ratio,
            is_short,
            is_medium,
            is_long,
            excl_sentence_ratio,
            question_line_ratio,
            conv_phrase_ratio,
            helpful_ratio,
        ]
| |
|
| |
|
class FeaturePipeline:
    """Build the combined feature matrix: word TF-IDF, char TF-IDF, stylometry.

    Each TF-IDF block is switched independently. A block is disabled when
    ``use_tfidf`` is false or when its configured ``max_features`` is 0 (or
    negative); a missing or ``None`` ``max_features`` means "enabled, no
    vocabulary cap". Disabled blocks contribute zero columns. The three
    blocks are stacked horizontally and rescaled with ``MaxAbsScaler``.

    Attributes:
        word_tfidf, char_tfidf: the two ``TfidfVectorizer`` instances.
        stylo: the ``StylometricFeatures`` transformer.
        scaler: ``MaxAbsScaler`` fitted in ``fit_transform``.
        use_word_tfidf, use_char_tfidf: per-block switches.
        use_tfidf: true when at least one TF-IDF block is enabled.
    """

    def __init__(self, use_tfidf=True):
        """Create the vectorizers from ``TFIDF_WORD_PARAMS`` / ``TFIDF_CHAR_PARAMS``.

        Args:
            use_tfidf: when false, both TF-IDF blocks are disabled regardless
                of the configured ``max_features``.
        """
        word_params = dict(TFIDF_WORD_PARAMS)
        char_params = dict(TFIDF_CHAR_PARAMS)

        word_on = self._limit_enables(word_params.get("max_features"))
        char_on = self._limit_enables(char_params.get("max_features"))

        # 0 is this module's "block disabled" marker; it is swapped for None
        # before the vectorizer is constructed, as the original code did.
        if word_params.get("max_features", 1) == 0:
            word_params["max_features"] = None
        if char_params.get("max_features", 1) == 0:
            char_params["max_features"] = None

        self.word_tfidf = TfidfVectorizer(**word_params)
        self.char_tfidf = TfidfVectorizer(**char_params)
        self.stylo = StylometricFeatures()
        self.scaler = MaxAbsScaler()

        # Separate switches: previously the char block was gated on the OR of
        # both limits, so a char limit of 0 still ran (uncapped) whenever the
        # word block was enabled, and use_tfidf=False did not stop the word
        # block.
        self.use_word_tfidf = bool(use_tfidf) and word_on
        self.use_char_tfidf = bool(use_tfidf) and char_on
        self.use_tfidf = self.use_word_tfidf or self.use_char_tfidf

    @staticmethod
    def _limit_enables(limit):
        """Return True when a ``max_features`` value leaves its block enabled."""
        return limit is None or limit > 0

    def _tfidf_switches(self):
        """Return ``(word_enabled, char_enabled)`` for this instance.

        Instances that lack the per-block attributes (for example, objects
        restored from a pickle written by an earlier version of this class)
        fall back to the rule that version applied, so their column layout
        still matches the scaler they were fitted with.
        """
        if hasattr(self, "use_word_tfidf") and hasattr(self, "use_char_tfidf"):
            return self.use_word_tfidf, self.use_char_tfidf
        limit = self.word_tfidf.max_features
        return (limit is not None and limit > 0), bool(self.use_tfidf)

    @staticmethod
    def _empty_block(n_rows):
        """Return an ``(n_rows, 0)`` float32 CSR matrix for a disabled block."""
        return csr_matrix((n_rows, 0), dtype=np.float32)

    def _clean_for_tfidf(self, text):
        """Strip CoT and markdown for TF-IDF (remove formatting artifacts, keep content)."""
        return strip_markdown(strip_cot(text))

    def fit_transform(self, texts):
        """Fit every enabled block and the scaler on *texts*; return the scaled matrix.

        Progress lines (input size, per-block column count and elapsed time,
        final shape) are printed to stdout.

        Args:
            texts: sized sequence of strings.

        Returns:
            The output of ``MaxAbsScaler.fit_transform`` on the stacked blocks.
        """
        import time

        print(f" Input: {len(texts)} texts", flush=True)

        # NOTE(review): the stylometric block receives the same cleaned text
        # as TF-IDF (the original applied strip_markdown(strip_cot(...)) a
        # second time to build it). Since strip_markdown removes headers,
        # bold markers, code and list prefixes, the markdown-count features
        # see little of that syntax. Confirm this is intended.
        cleaned = [self._clean_for_tfidf(t) for t in texts]
        word_on, char_on = self._tfidf_switches()

        blocks = []
        tfidf_plan = (
            ("word tfidf", word_on, self.word_tfidf),
            ("char tfidf", char_on, self.char_tfidf),
        )
        for label, enabled, vectorizer in tfidf_plan:
            if enabled:
                t0 = time.time()
                feats = vectorizer.fit_transform(cleaned)
                print(
                    f" {label}: {feats.shape[1]} features ({time.time() - t0:.1f}s)",
                    flush=True,
                )
            else:
                feats = self._empty_block(len(texts))
            blocks.append(feats)

        t0 = time.time()
        stylo_features = self.stylo.fit_transform(cleaned)
        print(
            f" stylometric: {stylo_features.shape[1]} features ({time.time() - t0:.1f}s)",
            flush=True,
        )
        blocks.append(stylo_features)

        combined = self.scaler.fit_transform(hstack(blocks))
        print(f" Combined feature matrix: {combined.shape}", flush=True)
        return combined

    def transform(self, texts):
        """Transform *texts* with the already-fitted blocks and scaler.

        Args:
            texts: sized sequence of strings.

        Returns:
            The output of ``MaxAbsScaler.transform`` on the stacked blocks,
            with the same column layout as ``fit_transform``.
        """
        cleaned = [self._clean_for_tfidf(t) for t in texts]
        word_on, char_on = self._tfidf_switches()

        if word_on:
            word_features = self.word_tfidf.transform(cleaned)
        else:
            word_features = self._empty_block(len(texts))

        if char_on:
            char_features = self.char_tfidf.transform(cleaned)
        else:
            char_features = self._empty_block(len(texts))

        stylo_features = self.stylo.transform(cleaned)
        combined = hstack([word_features, char_features, stylo_features])
        return self.scaler.transform(combined)
| |
|