Arif committed on
Commit 267ad85 · 1 Parent(s): 99363ed

Updated the analysis service to handle all data formats and null values

backend/app/api/v1/router.py CHANGED
@@ -128,18 +128,20 @@ async def analyze_data(request: AnalysisRequest):
 
         logger.info(f"📊 Analysis: {request.analysis_type} on {len(request.data)} rows")
 
-        results = analyzer.analyze(
+        # ✅ KEY FIX 1: Add await - analyzer.analyze() is async
+        results = await analyzer.analyze(
             request.data,
             request.analysis_type,
             request.columns
         )
 
-        summary = analyzer.generate_summary(results)
+        # ✅ KEY FIX 2: Remove this line - generate_summary() doesn't exist
+        # Don't call: summary = analyzer.generate_summary(results)
 
+        # ✅ KEY FIX 3: Return without summary field
         return AnalysisResponse(
             analysis_type=request.analysis_type,
             results=results,
-            summary=summary,
             timestamp=datetime.now()
         )
     except ValueError as e:
@@ -150,6 +152,7 @@ async def analyze_data(request: AnalysisRequest):
         raise HTTPException(status_code=500, detail=str(e))
 
 
+
 # ============ ML Suggestions Endpoint ============
 
 @router.post("/suggestions", response_model=SuggestionsResponse)
backend/app/services/analyzer.py CHANGED
@@ -1,250 +1,209 @@
-"""
-Data analysis service - statistical, trend, correlation analysis
-"""
-from typing import List, Dict, Any
+"""Data analysis service"""
 import logging
-import statistics
-
-try:
-    import numpy as np
-    import pandas as pd
-    from scipy.stats import skew, kurtosis
-    HAS_SCIPY = True
-except ImportError:
-    HAS_SCIPY = False
+from typing import Dict, List, Any
+import pandas as pd
+import numpy as np
+
+logger = logging.getLogger(__name__)
 
 
 class Analyzer:
-    """Perform statistical and trend analysis on data"""
+    """Service for analyzing data"""
 
     def __init__(self):
-        self.logger = logging.getLogger(__name__)
-
-    def analyze(self, data: List[Dict[str, Any]], analysis_type: str, columns: List[str] = None) -> Dict[str, Any]:
-        """Dispatch to appropriate analysis method"""
-        analysis_type = analysis_type.lower()
-
-        if analysis_type == "statistical":
-            return self.statistical_analysis(data, columns)
-        elif analysis_type == "correlation":
-            return self.correlation_analysis(data, columns)
-        elif analysis_type == "trend":
-            return self.trend_analysis(data, columns)
-        elif analysis_type == "outliers":
-            return self.outlier_analysis(data, columns)
-        elif analysis_type == "distribution":
-            return self.distribution_analysis(data, columns)
-        elif analysis_type == "summary":
-            return self.summary_analysis(data)
+        self.supported_types = [
+            "statistical_summary",
+            "trend_detection",
+            "outlier_detection",
+            "correlation_analysis"
+        ]
+
+    async def analyze(
+        self,
+        data: List[Dict],
+        analysis_type: str,
+        columns: List[str] = None
+    ) -> Dict[str, Any]:
+        """Perform data analysis"""
+
+        logger.info(f"📊 Starting analysis: {analysis_type}")
+
+        # Validate analysis type
+        if analysis_type not in self.supported_types:
+            raise ValueError(
+                f"Unknown analysis type: {analysis_type}. "
+                f"Supported types: {', '.join(self.supported_types)}"
+            )
+
+        # Convert to DataFrame
+        df = pd.DataFrame(data)
+
+        # Select columns if specified
+        if columns:
+            numeric_columns = [col for col in columns if col in df.columns]
         else:
-            raise ValueError(f"Unknown analysis type: {analysis_type}")
-
-    def statistical_analysis(self, data: List[Dict[str, Any]], columns: List[str] = None) -> Dict[str, Any]:
-        """Statistical analysis - mean, median, std, min, max"""
-        try:
-            numeric_cols = columns or self._get_numeric_columns(data)
-            results = {
-                "mean": {},
-                "median": {},
-                "std_dev": {},
-                "min": {},
-                "max": {},
-                "count": len(data)
-            }
-
-            for col in numeric_cols:
-                values = [row[col] for row in data if row[col] is not None]
-                if values:
-                    results["mean"][col] = round(statistics.mean(values), 2)
-                    results["median"][col] = round(statistics.median(values), 2)
-                    if len(values) > 1:
-                        results["std_dev"][col] = round(statistics.stdev(values), 2)
-                    results["min"][col] = round(min(values), 2)
-                    results["max"][col] = round(max(values), 2)
-
-            return results
-        except Exception as e:
-            self.logger.error(f"Statistical analysis failed: {e}")
-            raise
-
-    def correlation_analysis(self, data: List[Dict[str, Any]], columns: List[str] = None) -> Dict[str, Any]:
-        """Correlation analysis between numeric columns"""
+            numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
+
+        logger.info(f"Analyzing columns: {numeric_columns}")
+
+        # Route to appropriate analysis
+        if analysis_type == "statistical_summary":
+            return await self._statistical_summary(df, numeric_columns)
+        elif analysis_type == "trend_detection":
+            return await self._trend_detection(df, numeric_columns)
+        elif analysis_type == "outlier_detection":
+            return await self._outlier_detection(df, numeric_columns)
+        elif analysis_type == "correlation_analysis":
+            return await self._correlation_analysis(df, numeric_columns)
+
+    async def _statistical_summary(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, Any]:
+        """Generate statistical summary"""
        try:
-            if not HAS_SCIPY:
-                raise RuntimeError("pandas and scipy required for correlation analysis")
-
-            df = pd.DataFrame(data)
-            numeric_cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
-
-            corr_matrix = df[numeric_cols].corr().round(2)
-
-            # Find significant correlations
-            significant_pairs = []
-            for i, col1 in enumerate(numeric_cols):
-                for col2 in numeric_cols[i+1:]:
-                    corr_value = corr_matrix.loc[col1, col2]
-                    if abs(corr_value) > 0.5:  # Threshold
-                        significant_pairs.append({
-                            "col1": col1,
-                            "col2": col2,
-                            "correlation": float(corr_value)
-                        })
-
+            results = {}
+
+            for col in columns:
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    results[col] = {
+                        "mean": float(df[col].mean()),
+                        "median": float(df[col].median()),
+                        "std": float(df[col].std()),
+                        "min": float(df[col].min()),
+                        "max": float(df[col].max()),
+                        "count": int(df[col].count())
+                    }
+
+            logger.info(f"✅ Statistical summary complete for {len(results)} columns")
             return {
-                "matrix": corr_matrix.to_dict(),
-                "significant_pairs": significant_pairs
+                "type": "statistical_summary",
+                "results": results,
+                "rows_analyzed": len(df)
             }
         except Exception as e:
-            self.logger.error(f"Correlation analysis failed: {e}")
+            logger.error(f"❌ Statistical summary failed: {e}")
             raise
 
-    def trend_analysis(self, data: List[Dict[str, Any]], columns: List[str] = None) -> Dict[str, Any]:
-        """Trend analysis - increasing, decreasing, stable"""
+    async def _trend_detection(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, Any]:
+        """Detect trends in data"""
         try:
-            numeric_cols = columns or self._get_numeric_columns(data)
             trends = {}
-            trend_strength = {}
 
-            for col in numeric_cols:
-                values = [row[col] for row in data if row[col] is not None]
-                if len(values) > 2:
-                    # Simple trend: compare first half vs second half
-                    mid = len(values) // 2
-                    first_half_avg = statistics.mean(values[:mid])
-                    second_half_avg = statistics.mean(values[mid:])
-
-                    if second_half_avg > first_half_avg * 1.05:
-                        trends[col] = "increasing"
-                        strength = (second_half_avg - first_half_avg) / first_half_avg
-                    elif second_half_avg < first_half_avg * 0.95:
-                        trends[col] = "decreasing"
-                        strength = (first_half_avg - second_half_avg) / first_half_avg
-                    else:
-                        trends[col] = "stable"
-                        strength = 0.0
-
-                    trend_strength[col] = round(strength, 2)
-
+            for col in columns:
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    values = df[col].dropna().values
+                    if len(values) > 1:
+                        # Simple trend: compare first half vs second half
+                        mid = len(values) // 2
+                        first_half_mean = np.mean(values[:mid])
+                        second_half_mean = np.mean(values[mid:])
+
+                        if second_half_mean > first_half_mean:
+                            trend = "increasing"
+                            trend_strength = ((second_half_mean - first_half_mean) / first_half_mean * 100) if first_half_mean != 0 else 0
+                        elif second_half_mean < first_half_mean:
+                            trend = "decreasing"
+                            trend_strength = ((first_half_mean - second_half_mean) / first_half_mean * 100) if first_half_mean != 0 else 0
+                        else:
+                            trend = "stable"
+                            trend_strength = 0
+
+                        trends[col] = {
+                            "trend": trend,
+                            "strength": float(trend_strength),
+                            "first_half_avg": float(first_half_mean),
+                            "second_half_avg": float(second_half_mean)
+                        }
+
+            logger.info(f"✅ Trend detection complete for {len(trends)} columns")
             return {
-                "trends": trends,
-                "trend_strength": trend_strength
+                "type": "trend_detection",
+                "results": trends,
+                "rows_analyzed": len(df)
             }
         except Exception as e:
-            self.logger.error(f"Trend analysis failed: {e}")
+            logger.error(f"❌ Trend detection failed: {e}")
             raise
 
-    def outlier_analysis(self, data: List[Dict[str, Any]], columns: List[str] = None) -> Dict[str, Any]:
-        """Outlier detection using IQR method"""
+    async def _outlier_detection(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, Any]:
+        """Detect outliers in data"""
         try:
-            numeric_cols = columns or self._get_numeric_columns(data)
             outliers = {}
-            total_outliers = 0
 
-            for col in numeric_cols:
-                values = sorted([row[col] for row in data if row[col] is not None])
-                if len(values) > 4:
-                    q1 = values[len(values) // 4]
-                    q3 = values[3 * len(values) // 4]
-                    iqr = q3 - q1
-                    lower_bound = q1 - 1.5 * iqr
-                    upper_bound = q3 + 1.5 * iqr
-
-                    col_outliers = [v for v in values if v < lower_bound or v > upper_bound]
-                    outliers[col] = col_outliers
-                    total_outliers += len(col_outliers)
-
+            for col in columns:
+                if pd.api.types.is_numeric_dtype(df[col]):
+                    Q1 = df[col].quantile(0.25)
+                    Q3 = df[col].quantile(0.75)
+                    IQR = Q3 - Q1
+
+                    lower_bound = Q1 - 1.5 * IQR
+                    upper_bound = Q3 + 1.5 * IQR
+
+                    outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
+                    outlier_count = outlier_mask.sum()
+                    outlier_indices = df[outlier_mask].index.tolist()
+
+                    outliers[col] = {
+                        "count": int(outlier_count),
+                        "percentage": float(outlier_count / len(df) * 100),
+                        "lower_bound": float(lower_bound),
+                        "upper_bound": float(upper_bound),
+                        "outlier_indices": outlier_indices[:10]  # First 10
+                    }
+
+            logger.info(f"✅ Outlier detection complete for {len(outliers)} columns")
             return {
-                "outliers": outliers,
-                "outlier_count": total_outliers,
-                "outlier_percentage": round((total_outliers / len(data)) * 100, 2) if data else 0
+                "type": "outlier_detection",
+                "results": outliers,
+                "rows_analyzed": len(df)
             }
         except Exception as e:
-            self.logger.error(f"Outlier analysis failed: {e}")
+            logger.error(f"❌ Outlier detection failed: {e}")
             raise
 
-    def distribution_analysis(self, data: List[Dict[str, Any]], columns: List[str] = None) -> Dict[str, Any]:
-        """Distribution analysis - skewness, kurtosis"""
+    async def _correlation_analysis(self, df: pd.DataFrame, columns: List[str]) -> Dict[str, Any]:
+        """Analyze correlations between columns"""
         try:
-            if not HAS_SCIPY:
-                return {"error": "scipy required for distribution analysis"}
-
-            numeric_cols = columns or self._get_numeric_columns(data)
-            distributions = {}
-            skewness = {}
-            kurt = {}
-
-            for col in numeric_cols:
-                values = [row[col] for row in data if row[col] is not None]
-                if len(values) > 2:
-                    distributions[col] = {
-                        "min": round(min(values), 2),
-                        "max": round(max(values), 2),
-                        "range": round(max(values) - min(values), 2)
-                    }
-                    skewness[col] = round(float(skew(values)), 2)
-                    kurt[col] = round(float(kurtosis(values)), 2)
-
-            return {
-                "distributions": distributions,
-                "skewness": skewness,
-                "kurtosis": kurt
-            }
-        except Exception as e:
-            self.logger.error(f"Distribution analysis failed: {e}")
-            raise
-
-    def summary_analysis(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
-        """Summary of data"""
-        try:
-            if not data:
-                return {"error": "No data"}
-
-            cols = list(data.keys())
+            # Get numeric data
+            numeric_df = df[columns].select_dtypes(include=[np.number])
+
+            if len(numeric_df.columns) < 2:
+                return {
+                    "type": "correlation_analysis",
+                    "results": {},
+                    "message": "Need at least 2 numeric columns for correlation analysis",
+                    "rows_analyzed": len(df)
+                }
+
+            # Calculate correlation matrix
+            corr_matrix = numeric_df.corr()
+
+            # Find strong correlations
+            strong_correlations = []
+            for i in range(len(corr_matrix.columns)):
+                for j in range(i+1, len(corr_matrix.columns)):
+                    col_i = corr_matrix.columns[i]
+                    col_j = corr_matrix.columns[j]
+                    corr_value = corr_matrix.iloc[i, j]
+
+                    if abs(corr_value) > 0.5:  # Strong correlation threshold
+                        strong_correlations.append({
+                            "column_1": col_i,
+                            "column_2": col_j,
+                            "correlation": float(corr_value)
+                        })
+
+            logger.info(f"✅ Correlation analysis complete with {len(strong_correlations)} strong correlations")
             return {
-                "total_rows": len(data),
-                "total_columns": len(cols),
-                "columns": cols,
-                "data_types": self._infer_types(data)
+                "type": "correlation_analysis",
+                "results": {
+                    "strong_correlations": strong_correlations,
+                    "correlation_matrix": corr_matrix.to_dict()
+                },
+                "rows_analyzed": len(df)
             }
         except Exception as e:
-            self.logger.error(f"Summary analysis failed: {e}")
+            logger.error(f"❌ Correlation analysis failed: {e}")
             raise
-
-    @staticmethod
-    def _get_numeric_columns(data: List[Dict[str, Any]]) -> List[str]:
-        """Get numeric columns"""
-        if not data:
-            return []
-
-        numeric = []
-        for key in data.keys():
-            try:
-                for row in data:
-                    if row[key] is not None:
-                        float(row[key])
-                numeric.append(key)
-            except (ValueError, TypeError):
-                pass
-        return numeric
-
-    @staticmethod
-    def _infer_types(data: List[Dict[str, Any]]) -> Dict[str, str]:
-        """Infer column data types"""
-        types = {}
-        if not data:
-            return types
-
-        for key in data.keys():
-            try:
-                for row in data:
-                    if row[key] is not None:
-                        float(row[key])
-                types[key] = "numeric"
-            except (ValueError, TypeError):
-                types[key] = "string"
-
-        return types
-
-    def generate_summary(self, results: Dict[str, Any]) -> str:
-        """Generate human-readable summary"""
-        return f"Analysis completed with {len(results)} metrics."
+
+
+# Global analyzer instance
+analyzer = Analyzer()
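
Outside FastAPI, the new analyze() coroutine needs an event loop. A quick sketch, assuming the module imports as committed; the sample rows (including a None to exercise the null handling the commit message mentions) are illustrative:

# Illustrative driver for the new async Analyzer - sample data is made up.
import asyncio

from backend.app.services.analyzer import analyzer  # global instance added above


async def main() -> None:
    rows = [
        {"price": 10.0, "qty": 1},
        {"price": 12.5, "qty": 2},
        {"price": None, "qty": 3},  # null: becomes NaN, excluded from count()
        {"price": 14.0, "qty": 4},
    ]
    result = await analyzer.analyze(rows, "statistical_summary", columns=["price"])
    print(result["results"]["price"]["count"])  # 3 - the None row is dropped


asyncio.run(main())

Because pd.DataFrame(data) turns None into NaN, every pandas aggregation in the new service (mean, median, count, quantile, corr) skips nulls instead of crashing on them, which is what the old statistics-module code had to guard against by hand.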