baiganinn committed
Commit 7382f7a · 0 Parent(s)

Initial commit with LFS for model file

Files changed (6)
  1. .gitattributes +1 -0
  2. .gitignore +44 -0
  3. app.py +277 -0
  4. exoplanet_detector.joblib +3 -0
  5. mapping.py +381 -0
  6. requirements.txt +7 -0
.gitattributes ADDED
@@ -0,0 +1 @@
+ *.joblib filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,44 @@
+ # Environment variables
+ .env
+ .env.local
+ .env.*.local
+
+ # Python cache
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # Virtual environments
+ venv/
+ ENV/
+ env/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
app.py ADDED
@@ -0,0 +1,277 @@
+ """
+ Gradio UI for exoplanet prediction
+ """
+
+ import gradio as gr
+ import pandas as pd
+ import joblib
+ import os
+ import time
+ from mapping import ColumnMapper, load_training_columns
+ from dotenv import load_dotenv
+
+ # Load environment variables from the .env file
+ load_dotenv()
+
+ # Constants
+ TRAINING_CSV_PATH = "cumulative_2025.10.03_08.34.41.csv"
+ MODEL_PATH = "exoplanet_detector.joblib"
+ TOGETHER_API_KEY = os.getenv("TOGETHER_API_KEY", "")
+
+ # Load the model and the training dataset columns
+ model = joblib.load(MODEL_PATH)
+ training_columns = load_training_columns(TRAINING_CSV_PATH)
+
+ # Initialize the column mapper
+ mapper = ColumnMapper(api_key=TOGETHER_API_KEY)
+
+
+ def predict_exoplanets(uploaded_file):
+     """
+     Process the uploaded file and return predictions
+
+     Args:
+         uploaded_file: Uploaded CSV file (path string or file object)
+
+     Returns:
+         Tuple (display results, mapping info, statistics, path to full results CSV)
+     """
+     start_time = time.time()
+
+     try:
+         # Load dataset
+         if uploaded_file is None:
+             return None, "Error: Please upload a CSV file", None, None
+
+         # Read uploaded file; gr.File(type="filepath") passes a path string,
+         # but fall back to .name for file-like objects
+         file_path = uploaded_file if isinstance(uploaded_file, str) else uploaded_file.name
+         df_uploaded = pd.read_csv(file_path, comment='#')
+
+         info_msg = f"Loaded rows: {len(df_uploaded)}\n"
+         info_msg += f"Columns in uploaded dataset: {len(df_uploaded.columns)}\n\n"
+
+         # Apply column mapping
+         mapping_start = time.time()
+         info_msg += "Performing column mapping via Llama...\n\n"
+
+         df_mapped, mapping, mapping_info = mapper.map_dataset(df_uploaded, training_columns)
+
+         mapping_time = time.time() - mapping_start
+         info_msg += mapping_info + "\n"
+         info_msg += f"Mapping time: {mapping_time:.2f} sec\n\n"
+
+         # Get the features expected by the model
+         try:
+             expected_features = list(model.feature_names_in_)
+             info_msg += f"Model expects {len(expected_features)} features\n\n"
+         except AttributeError:
+             # If feature_names_in_ is not available, use all columns except targets
+             target_cols = ['koi_disposition', 'koi_pdisposition']
+             expected_features = [col for col in training_columns if col not in target_cols]
+             info_msg += f"Using {len(expected_features)} features from training dataset\n\n"
+
+         # Prepare X with the correct columns
+         info_msg += f"Creating DataFrame with {len(expected_features)} columns...\n"
+
+         # Create an empty DataFrame with the expected columns
+         X = pd.DataFrame(index=df_mapped.index, columns=expected_features)
+
+         # Fill columns that exist in df_mapped
+         for col in expected_features:
+             if col in df_mapped.columns:
+                 X[col] = df_mapped[col].values
+             else:
+                 X[col] = 0.0  # Fill missing columns with zeros
+
+         # Convert all columns to numeric
+         X = X.apply(pd.to_numeric, errors='coerce')
+
+         # Calculate statistics
+         available_cols = [col for col in expected_features if col in df_mapped.columns]
+         missing_cols = [col for col in expected_features if col not in df_mapped.columns]
+
+         if missing_cols:
+             info_msg += f"Warning: {len(missing_cols)} columns missing (filled with zeros)\n"
+
+         info_msg += f"DEBUG: X.shape = {X.shape}, expected: ({len(df_mapped)}, {len(expected_features)})\n"
+
+         # Fill NaN with column means
+         X = X.fillna(X.mean().fillna(0))
+
+         info_msg += f"DEBUG: After fillna X.shape = {X.shape}\n"
+
+         info_msg += f"Data processing: {X.shape}\n"
+         info_msg += f" Filled: {len(available_cols)} columns, Added zeros: {len(missing_cols)}\n"
+         info_msg += "Data prepared for model\n\n"
+
+         # Make predictions
+         pred_start = time.time()
+
+         # Use a numpy array instead of a DataFrame to bypass feature-name checks
+         X_values = X.values  # Convert to numpy array
+
+         info_msg += f"DEBUG: X_values.shape = {X_values.shape}\n\n"
+
+         predictions = model.predict(X_values)
+         predictions_proba = model.predict_proba(X_values)
+         pred_time = time.time() - pred_start
+
+         info_msg += f"Predictions completed: {len(predictions)} objects in {pred_time:.2f} sec\n"
+
+         # Create result dataframe
+         df_result = df_uploaded.copy()
+
+         # Get unique classes from the model
+         classes = model.classes_
+         info_msg += f" Found classes: {list(classes)}\n\n"
+
+         # Add predictions (text labels)
+         df_result['prediction'] = predictions
+
+         # Add probabilities for each class
+         for i, class_name in enumerate(classes):
+             df_result[f'confidence_{class_name.replace(" ", "_").lower()}'] = predictions_proba[:, i]
+
+         # Add mapping information as separate columns
+         if mapping:
+             for src_col, tgt_col in mapping.items():
+                 if src_col in df_uploaded.columns and tgt_col in df_mapped.columns:
+                     df_result[f'mapped_as_{tgt_col}'] = df_uploaded[src_col]
+
+         # Create a simplified view with only the important columns for display
+         # Start with the prediction columns
+         display_columns = ['prediction']
+         for class_name in classes:
+             col_name = f'confidence_{class_name.replace(" ", "_").lower()}'
+             if col_name in df_result.columns:
+                 display_columns.append(col_name)
+
+         # Add mapped columns (if any)
+         mapped_cols = [col for col in df_result.columns if col.startswith('mapped_as_')]
+         display_columns.extend(mapped_cols[:10])  # Show first 10 mapped columns
+
+         # If there are no mapped columns, add the first 5 original columns
+         if not mapped_cols and len(df_uploaded.columns) > 0:
+             original_cols = [col for col in df_uploaded.columns[:5] if col in df_result.columns]
+             display_columns.extend(original_cols)
+
+         # Create dataframe for display
+         df_display = df_result[display_columns].copy()
+
+         total_time = time.time() - start_time
+
+         # Build per-class statistics
+         from collections import Counter
+         pred_counts = Counter(predictions)
+
+         stats_lines = ["**Prediction Statistics:**\n"]
+         stats_lines.append(f"* Total objects: {len(predictions)}\n")
+
+         for class_name in classes:
+             count = pred_counts.get(class_name, 0)
+             pct = count / len(predictions) * 100 if len(predictions) > 0 else 0
+             stats_lines.append(f"* {class_name}: {count} ({pct:.1f}%)\n")
+
+         stats_lines.append(f"\n**Processing time:** {total_time:.2f} seconds\n")
+         stats_lines.append("\n**Columns in result:**\n")
+         stats_lines.append("* All original columns from the uploaded file (with original names)\n")
+         stats_lines.append(f"* `prediction`: Predicted class ({', '.join(classes)})\n")
+
+         for class_name in classes:
+             col_name = f'confidence_{class_name.replace(" ", "_").lower()}'
+             stats_lines.append(f"* `{col_name}`: Probability of class {class_name}\n")
+
+         stats_lines.append("* Columns `mapped_as_*`: Duplicates of mapped columns for reference\n")
+         stats_lines.append(f"\n**Total columns in result:** {len(df_result.columns)}\n")
+
+         stats = "".join(stats_lines) + f"""
+
+ **Mapping completed:** {len(mapping)} columns renamed for the model
+
+ **Full dataset saved:** All {len(df_result.columns)} columns available for download
+ """
+
+         # Save the full result to a temporary file for download
+         output_file = "predictions_result.csv"
+         df_result.to_csv(output_file, index=False)
+
+         # Return the simplified output for display and the path to the full file
+         return df_display, info_msg, stats, output_file
+
+     except Exception as e:
+         error_msg = f"Error processing file:\n{str(e)}"
+         import traceback
+         error_msg += f"\n\n{traceback.format_exc()}"
+         return None, error_msg, None, None
+
+
+ # Create Gradio interface
+ with gr.Blocks(title="Exoplanet Detector", theme=gr.themes.Soft()) as demo:
+     gr.Markdown("""
+ # Exoplanet Detector
+
+ Upload a CSV file with data about exoplanet candidates (KOI - Kepler Objects of Interest).
+
+ **How it works:**
+ 1. Upload your dataset with any column structure
+ 2. Llama automatically maps your columns to the training columns
+ 3. The model predicts: exoplanet or false positive
+
+ **Model:** Random Forest Classifier
+ **Mapping:** Llama 3.3 70B via Together AI
+
+ **Note:** Processing large datasets (>1000 rows) may take several minutes.
+ """)
+
+     with gr.Row():
+         with gr.Column(scale=1):
+             file_input = gr.File(
+                 label="Upload CSV file",
+                 file_types=[".csv"],
+                 type="filepath"
+             )
+             submit_btn = gr.Button("Run Prediction", variant="primary", size="lg")
+
+         with gr.Column(scale=2):
+             mapping_info = gr.Textbox(
+                 label="Column Mapping Information",
+                 lines=15,
+                 max_lines=20
+             )
+
+     with gr.Row():
+         stats_output = gr.Markdown(label="Statistics")
+
+     with gr.Row():
+         results_output = gr.Dataframe(
+             label="Prediction Results (main columns)",
+             wrap=True,
+             interactive=False
+         )
+
+     with gr.Row():
+         download_output = gr.File(
+             label="Download full result with all columns",
+             interactive=False
+         )
+
+     # Event handler
+     submit_btn.click(
+         fn=predict_exoplanets,
+         inputs=[file_input],
+         outputs=[results_output, mapping_info, stats_output, download_output]
+     )
+
+     gr.Markdown("""
+ ---
+ ### Tips:
+ - Make sure your CSV file contains data about stellar systems and their characteristics
+ - The more columns match the training dataset, the more accurate the predictions will be
+ - The model was trained on NASA Exoplanet Archive data (Kepler Mission)
+
+ ### Example training dataset columns:
+ `koi_period`, `koi_depth`, `koi_prad`, `koi_teq`, `koi_insol`, `koi_steff`, `koi_slogg`, `koi_srad`, `ra`, `dec`, `koi_kepmag`, etc.
+ """)
+
+ # Launch application
+ if __name__ == "__main__":
+     demo.launch(share=False, server_name="0.0.0.0", server_port=7860)
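For reference, here is a minimal sketch (not part of the commit) of driving `predict_exoplanets` without the Gradio UI. It assumes the model file, the training CSV, and a `TOGETHER_API_KEY` environment variable are available locally, and uses a hypothetical `candidates.csv` as input:

```python
# Importing app loads the model, the training columns, and the ColumnMapper at module level
import app

# predict_exoplanets accepts a CSV path and returns the same four values the UI wires up
df_display, mapping_info, stats, output_file = app.predict_exoplanets("candidates.csv")

print(mapping_info)   # column-mapping log produced during preprocessing
print(stats)          # per-class prediction statistics (Markdown)
print(output_file)    # path to predictions_result.csv with all columns
```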
exoplanet_detector.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a4c9ae4d4bb2830473d74b4fd806ba9545785797e895027504c7cf0085fed11d
+ size 6900161
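The committed file is a Git LFS pointer rather than the serialized model; the real ~6.9 MB artifact is fetched via LFS thanks to the `*.joblib` rule in `.gitattributes`. A small sketch (not part of the commit) of inspecting the downloaded model, assuming `git lfs pull` has replaced the pointer with the actual file:

```python
import joblib

# Load the scikit-learn estimator that app.py uses at startup
model = joblib.load("exoplanet_detector.joblib")

# app.py relies on these attributes of the fitted classifier
print(type(model).__name__)                           # e.g. RandomForestClassifier
print(list(model.classes_))                           # labels behind the confidence_* columns
print(len(getattr(model, "feature_names_in_", [])))   # expected input features, if recorded
```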
mapping.py ADDED
@@ -0,0 +1,381 @@
+ """
+ Module for mapping the columns of an uploaded dataset onto the training dataset columns,
+ using Llama via the Together API
+ """
+
+ import ast
+ import pandas as pd
+ import os
+ import re
+ from together import Together
+
+
+ def convert_coordinates_to_degrees(value):
+     """
+     Converts coordinates from HMS/DMS format to decimal degrees
+     Examples: '07h29m25.85s' -> 112.357708 degrees
+               '45d30m15.5s' -> 45.504306 degrees
+     """
+     if pd.isna(value) or isinstance(value, (int, float)):
+         return value
+
+     value_str = str(value).strip()
+
+     # HMS format (hours:minutes:seconds) for RA
+     hms_match = re.match(r'(\d+)h(\d+)m([\d.]+)s?', value_str)
+     if hms_match:
+         hours = float(hms_match.group(1))
+         minutes = float(hms_match.group(2))
+         seconds = float(hms_match.group(3))
+         return hours * 15 + minutes * 0.25 + seconds * 0.00416667  # 1h = 15°, 1m = 0.25°, 1s = 0.00416667°
+
+     # DMS format (degrees:minutes:seconds) for Dec
+     dms_match = re.match(r'([+-]?)(\d+)d(\d+)m([\d.]+)s?', value_str)
+     if dms_match:
+         sign = -1 if dms_match.group(1) == '-' else 1
+         degrees = float(dms_match.group(2))
+         minutes = float(dms_match.group(3))
+         seconds = float(dms_match.group(4))
+         return sign * (degrees + minutes / 60 + seconds / 3600)
+
+     # If the format is not recognized, return NaN
+     return float('nan')
+
+
+ class ColumnMapper:
+     def __init__(self, api_key: str):
+         """
+         Initialize the column mapper
+
+         Args:
+             api_key: API key for Together AI
+         """
+         self.client = Together(api_key=api_key)
+
+         # Built-in synonym dictionary (fallback) - significantly expanded
+         self.known_synonyms = {
+             # Orbital period
+             'pl_orbper': 'koi_period',
+             'orbital_period': 'koi_period',
+             'period': 'koi_period',
+             'pl_orbpererr1': 'koi_period_err1',
+             'pl_orbpererr2': 'koi_period_err2',
+             'pl_orbpererr': 'koi_period_err1',
+
+             # Transit time/epoch
+             'pl_tranmid': 'koi_time0bk',
+             'transit_time': 'koi_time0bk',
+             'time0': 'koi_time0bk',
+             'epoch': 'koi_time0bk',
+             'pl_tranmiderr1': 'koi_time0bk_err1',
+             'pl_tranmiderr2': 'koi_time0bk_err2',
+
+             # Transit duration
+             'pl_trandur': 'koi_duration',
+             'pl_trandurh': 'koi_duration',
+             'transit_duration': 'koi_duration',
+             'duration': 'koi_duration',
+             'pl_trandurerr1': 'koi_duration_err1',
+             'pl_trandurerr2': 'koi_duration_err2',
+
+             # Transit depth
+             'pl_trandep': 'koi_depth',
+             'transit_depth': 'koi_depth',
+             'depth': 'koi_depth',
+             'pl_trandeperr1': 'koi_depth_err1',
+             'pl_trandeperr2': 'koi_depth_err2',
+
+             # Planet radius
+             'pl_rade': 'koi_prad',
+             'pl_radj': 'koi_prad',
+             'planet_radius': 'koi_prad',
+             'radius': 'koi_prad',
+             'pl_radeerr1': 'koi_prad_err1',
+             'pl_radeerr2': 'koi_prad_err2',
+             'pl_radjerr1': 'koi_prad_err1',
+             'pl_radjerr2': 'koi_prad_err2',
+
+             # Insolation flux
+             'pl_insol': 'koi_insol',
+             'insolation': 'koi_insol',
+             'insol': 'koi_insol',
+             'pl_insolerr1': 'koi_insol_err1',
+             'pl_insolerr2': 'koi_insol_err2',
+
+             # Equilibrium temperature
+             'pl_eqt': 'koi_teq',
+             'equilibrium_temp': 'koi_teq',
+             'teq': 'koi_teq',
+             'pl_eqterr1': 'koi_teq_err1',
+             'pl_eqterr2': 'koi_teq_err2',
+
+             # Stellar effective temperature
+             'st_teff': 'koi_steff',
+             'stellar_teff': 'koi_steff',
+             'star_temp': 'koi_steff',
+             'teff': 'koi_steff',
+             'st_tefferr1': 'koi_steff_err1',
+             'st_tefferr2': 'koi_steff_err2',
+
+             # Stellar surface gravity
+             'st_logg': 'koi_slogg',
+             'stellar_logg': 'koi_slogg',
+             'surface_gravity': 'koi_slogg',
+             'logg': 'koi_slogg',
+             'st_loggerr1': 'koi_slogg_err1',
+             'st_loggerr2': 'koi_slogg_err2',
+
+             # Stellar radius
+             'st_rad': 'koi_srad',
+             'stellar_radius': 'koi_srad',
+             'star_radius': 'koi_srad',
+             'st_raderr1': 'koi_srad_err1',
+             'st_raderr2': 'koi_srad_err2',
+
+             # Stellar mass
+             'st_mass': 'koi_smass',
+             'stellar_mass': 'koi_smass',
+             'st_masserr1': 'koi_smass_err1',
+             'st_masserr2': 'koi_smass_err2',
+
+             # Kepler magnitude
+             'sy_kepmag': 'koi_kepmag',
+             'kepmag': 'koi_kepmag',
+             'kep_mag': 'koi_kepmag',
+             'sy_kepmaglim': 'koi_kepmag',
+
+             # Coordinates
+             'ra': 'ra',
+             'ra_deg': 'ra',
+             'rastr': 'ra',
+             'dec': 'dec',
+             'dec_deg': 'dec',
+             'decstr': 'dec',
+
+             # Model SNR
+             'koi_model_snr': 'koi_model_snr',
+             'snr': 'koi_model_snr',
+
+             # Impact parameter
+             'pl_imppar': 'koi_impact',
+             'impact': 'koi_impact',
+             'impact_parameter': 'koi_impact',
+
+             # Additional mappings for error columns
+             'koi_period_err': 'koi_period_err1',
+             'koi_time0bk_err': 'koi_time0bk_err1',
+             'koi_duration_err': 'koi_duration_err1',
+             'koi_depth_err': 'koi_depth_err1',
+             'koi_prad_err': 'koi_prad_err1',
+             'koi_teq_err': 'koi_teq_err1',
+             'koi_insol_err': 'koi_insol_err1',
+             'koi_steff_err': 'koi_steff_err1',
+             'koi_slogg_err': 'koi_slogg_err1',
+             'koi_srad_err': 'koi_srad_err1',
+             'koi_smass_err': 'koi_smass_err1',
+         }
+
+     def get_column_mapping(self, source_columns: list, target_columns: list) -> dict:
+         """
+         Gets the mapping between source columns and target columns using the LLM
+
+         Args:
+             source_columns: List of columns in the uploaded dataset
+             target_columns: List of columns in the training dataset
+
+         Returns:
+             Mapping dictionary {source_column: target_column}
+         """
+         # Known synonym reference included in the prompt for accurate mapping
+         known_mappings = """
+ Common column name mappings (NASA Exoplanet Archive):
+ - pl_orbper, orbital_period, period → koi_period (Orbital Period in days)
+ - pl_tranmid, transit_time, time0 → koi_time0bk (Transit Epoch in BJD)
+ - pl_trandur, pl_trandurh, transit_duration → koi_duration (Transit Duration in hours)
+ - pl_trandep, transit_depth, depth → koi_depth (Transit Depth in ppm)
+ - pl_rade, planet_radius, radius → koi_prad (Planetary Radius in Earth radii)
+ - pl_insol, insolation, insol → koi_insol (Insolation Flux in Earth flux)
+ - pl_eqt, equilibrium_temp, teq → koi_teq (Equilibrium Temperature in K)
+ - st_teff, stellar_teff, star_temp → koi_steff (Stellar Effective Temperature in K)
+ - st_logg, stellar_logg, surface_gravity → koi_slogg (Stellar Surface Gravity in log10(cm/s^2))
+ - st_rad, stellar_radius, star_radius → koi_srad (Stellar Radius in Solar radii)
+ - st_mass, stellar_mass, star_mass → koi_smass (Stellar Mass in Solar masses)
+ - ra, ra_deg → ra (Right Ascension in degrees)
+ - dec, dec_deg → dec (Declination in degrees)
+ - pl_bmassj, planet_mass → koi_prad (use radius if mass not available)
+ - sy_dist, distance → koi_steff (stellar distance - related to stellar properties)
+ """
+
+         prompt = f"""You are an expert in NASA Exoplanet Archive data mapping. Map column names from a source dataset to Kepler/KOI target dataset columns.
+
+ {known_mappings}
+
+ Source columns:
+ {source_columns}
+
+ Target columns:
+ {target_columns}
+
+ CRITICAL INSTRUCTIONS:
+ 1. Use the known mappings above as your PRIMARY reference
+ 2. Match columns based on physical meaning (e.g., "pl_orbper" = orbital period = "koi_period")
+ 3. Common prefixes: "pl_" = planet property, "st_" = stellar property, "koi_" = KOI property
+ 4. If exact match exists in known mappings, USE IT
+ 5. Only map columns with clear semantic similarity
+ 6. Return ONLY a Python dictionary: {{"source": "target", ...}}
+ 7. NO markdown, NO explanations, NO code blocks - just the dictionary
+
+ Example: {{"pl_orbper": "koi_period", "st_teff": "koi_steff", "ra": "ra"}}
+
+ Mapping:"""
+
+         response = self.client.chat.completions.create(
+             model="meta-llama/Llama-3.3-70B-Instruct-Turbo",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0.1,
+             max_tokens=2000
+         )
+
+         mapping_str = response.choices[0].message.content.strip()
+
+         # Strip possible markdown code fences from the response
+         if "```" in mapping_str:
+             mapping_str = mapping_str.split("```")[1]
+         if mapping_str.startswith("python"):
+             mapping_str = mapping_str[6:]
+         mapping_str = mapping_str.strip()
+
+         # Convert the string into a dictionary (literal_eval avoids executing arbitrary code)
+         try:
+             mapping = ast.literal_eval(mapping_str)
+             if not isinstance(mapping, dict):
+                 raise ValueError("Response is not a dictionary")
+         except Exception as e:
+             print(f"Error parsing mapping: {e}")
+             print(f"Raw response: {mapping_str}")
+             # Return an empty mapping on error
+             mapping = {}
+
+         # Supplement the mapping with known synonyms (fallback)
+         # Check source columns that were not mapped by Llama
+         unmapped_sources = [col for col in source_columns if col not in mapping]
+
+         for src_col in unmapped_sources:
+             src_lower = src_col.lower()
+
+             # Check for an exact match with known synonyms
+             if src_lower in self.known_synonyms:
+                 target = self.known_synonyms[src_lower]
+                 if target in target_columns:
+                     mapping[src_col] = target
+                     continue
+
+             # Check for partial matches (more sophisticated)
+             # Remove common prefixes/suffixes for comparison
+             src_clean = src_lower.replace('pl_', '').replace('st_', '').replace('sy_', '').replace('koi_', '')
+
+             for known_src, known_tgt in self.known_synonyms.items():
+                 known_clean = known_src.replace('pl_', '').replace('st_', '').replace('sy_', '').replace('koi_', '')
+
+                 # Check whether the core part matches
+                 if src_clean == known_clean or known_clean in src_clean or src_clean in known_clean:
+                     if known_tgt in target_columns:
+                         mapping[src_col] = known_tgt
+                         break
+
+             # If still not mapped, try fuzzy matching on target columns
+             if src_col not in mapping:
+                 for tgt_col in target_columns:
+                     tgt_clean = tgt_col.replace('koi_', '')
+                     # Check whether the source name contains the target name
+                     if tgt_clean in src_lower or src_clean == tgt_clean:
+                         mapping[src_col] = tgt_col
+                         break
+
+         return mapping
+
+     def apply_mapping(self, df: pd.DataFrame, mapping: dict) -> pd.DataFrame:
+         """
+         Applies the mapping to a dataframe
+
+         Args:
+             df: Source dataframe
+             mapping: Mapping dictionary
+
+         Returns:
+             Dataframe with renamed columns
+         """
+         # Rename only the columns that are present in the mapping
+         df_mapped = df.copy()
+
+         # Check which mapping keys actually exist in the dataframe
+         valid_mapping = {k: v for k, v in mapping.items() if k in df.columns}
+
+         if valid_mapping:
+             df_mapped = df_mapped.rename(columns=valid_mapping)
+
+         return df_mapped
+
+     def map_dataset(self, uploaded_df: pd.DataFrame, target_columns: list) -> tuple:
+         """
+         Full dataset mapping pipeline
+
+         Args:
+             uploaded_df: Uploaded dataframe
+             target_columns: List of columns in the training dataset
+
+         Returns:
+             Tuple (mapped_dataframe, mapping_dict, info_message)
+         """
+         # Copy the dataframe so the original is not modified
+         df_work = uploaded_df.copy()
+
+         # Convert coordinates to degrees if they are stored as text
+         coord_columns = [col for col in df_work.columns if any(
+             keyword in col.lower() for keyword in ['ra', 'dec', 'coord', 'right_ascension', 'declination']
+         )]
+
+         for col in coord_columns:
+             # Check the first non-empty value
+             first_val = df_work[col].dropna().iloc[0] if len(df_work[col].dropna()) > 0 else None
+             if first_val and isinstance(first_val, str) and ('h' in first_val or 'd' in first_val):
+                 # Convert the entire column
+                 df_work[col] = df_work[col].apply(convert_coordinates_to_degrees)
+
+         source_columns = df_work.columns.tolist()
+
+         # Get mapping via LLM
+         mapping = self.get_column_mapping(source_columns, target_columns)
+
+         # Apply mapping
+         mapped_df = self.apply_mapping(df_work, mapping)
+
+         # Create info message
+         if mapping:
+             info_msg = f"Successfully mapped {len(mapping)} columns:\n"
+             for src, tgt in mapping.items():
+                 info_msg += f" * {src} -> {tgt}\n"
+         else:
+             info_msg = "Warning: No mapping performed - no matches found between columns\n"
+             info_msg += f"Source columns: {', '.join(source_columns[:5])}...\n"
+
+         # Check which target columns are missing
+         missing_cols = set(target_columns) - set(mapped_df.columns)
+         if missing_cols:
+             info_msg += f"\nWarning: Missing {len(missing_cols)} target columns (will be filled with NaN)\n"
+
+         return mapped_df, mapping, info_msg
+
+
+ def load_training_columns(csv_path: str) -> list:
+     """
+     Load column names from the training dataset
+
+     Args:
+         csv_path: Path to the training dataset CSV file
+
+     Returns:
+         List of column names
+     """
+     df = pd.read_csv(csv_path, comment='#', nrows=1)
+     return df.columns.tolist()
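A short sketch (not part of the commit) of the mapping helpers in isolation, assuming the dependencies from requirements.txt are installed. The coordinate strings echo the examples in the `convert_coordinates_to_degrees` docstring, and the column names below are hypothetical:

```python
import pandas as pd
from mapping import ColumnMapper, convert_coordinates_to_degrees

# HMS/DMS strings are converted to decimal degrees; numeric values pass through unchanged
print(convert_coordinates_to_degrees("07h29m25.85s"))   # ~112.3577 (RA)
print(convert_coordinates_to_degrees("-45d30m15.5s"))   # ~-45.5043 (Dec)
print(convert_coordinates_to_degrees(180.0))            # 180.0

# apply_mapping only renames columns, so it can be exercised without an API call;
# a real Together key is only needed once get_column_mapping is invoked
mapper = ColumnMapper(api_key="")
df = pd.DataFrame({"pl_orbper": [3.52], "st_teff": [5750.0]})
mapping = {"pl_orbper": "koi_period", "st_teff": "koi_steff"}
print(mapper.apply_mapping(df, mapping).columns.tolist())  # ['koi_period', 'koi_steff']
```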
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ gradio==4.44.0
+ pandas==2.2.2
+ scikit-learn==1.5.1
+ joblib==1.4.2
+ together
+ python-dotenv==1.0.1
+ numpy==1.26.4
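A tiny sketch (not part of the commit) for checking that an installed environment matches these pins, using only the standard library; `together` is left out because it is intentionally unpinned:

```python
from importlib import metadata

# Pinned versions copied from requirements.txt
pinned = {
    "gradio": "4.44.0",
    "pandas": "2.2.2",
    "scikit-learn": "1.5.1",
    "joblib": "1.4.2",
    "python-dotenv": "1.0.1",
    "numpy": "1.26.4",
}

for package, expected in pinned.items():
    installed = metadata.version(package)
    status = "OK" if installed == expected else f"expected {expected}"
    print(f"{package}: {installed} ({status})")
```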