import pandas as pd
import json
import os
import joblib
from sklearn.preprocessing import MinMaxScaler

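# Paths for the raw metric history, the cleaned/scaled outputs, and the fitted scaler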
raw_csv_path = "/home/marco/prometheus_data.csv"
processed_csv_path = "/home/marco/processed_prometheus_data.csv"
scaled_csv_path = "/home/marco/processed_prometheus_data_scaled.csv"
scaler_path = "/home/marco/scaler.pkl"

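# Load the existing metric history if present, otherwise start with an empty frame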
if os.path.exists(raw_csv_path):
    df = pd.read_csv(raw_csv_path)
else:
    df = pd.DataFrame()

def extract_metric(json_file):
    """Return the latest value from a Prometheus instant-query JSON dump, or None."""
    try:
        with open(json_file) as f:
            data = json.load(f)
        if data['data']['result']:
            return float(data['data']['result'][0]['value'][1])
        return None
    except (OSError, ValueError, KeyError, IndexError):
        return None

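# Pull the most recent sample of each metric from the Prometheus query dumps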
cpu_usage = extract_metric("/home/marco/cpu_usage.json")
memory_usage = extract_metric("/home/marco/memory_usage.json")
disk_read = extract_metric("/home/marco/disk_read.json")
disk_write = extract_metric("/home/marco/disk_write.json")
network_receive = extract_metric("/home/marco/net_receive.json")
network_transmit = extract_metric("/home/marco/net_transmit.json")

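# Append the new sample to the history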
new_row = {
    'timestamp': pd.Timestamp.now(),
    'cpu_usage': cpu_usage,
    'memory_usage': memory_usage,
    'disk_read': disk_read,
    'disk_write': disk_write,
    'network_receive': network_receive,
    'network_transmit': network_transmit,
}

new_row_df = pd.DataFrame([new_row])

if df.empty:
    df = new_row_df
else:
    df = pd.concat([df, new_row_df], ignore_index=True)

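# read_csv loads timestamps as strings, so coerce the column back to datetime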
df['timestamp'] = pd.to_datetime(df['timestamp'])

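# Deltas between consecutive samples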
df['cpu_change'] = df['cpu_usage'].diff()
df['memory_change'] = df['memory_usage'].diff()
df['disk_read_speed'] = df['disk_read'].diff()
df['disk_write_speed'] = df['disk_write'].diff()
df['network_receive_speed'] = df['network_receive'].diff()
df['network_transmit_speed'] = df['network_transmit'].diff()

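# Rolling statistics over the last five samples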
df['cpu_rolling'] = df['cpu_usage'].rolling(window=5).mean()
df['memory_std'] = df['memory_usage'].rolling(window=5).std()

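# Cross-metric ratios; the +1 guards the disk/CPU ratio against division by zero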
df['cpu_mem_ratio'] = df['cpu_usage'] / df['memory_usage']
df['disk_cpu_ratio'] = df['disk_read'] / (df['cpu_usage'] + 1)

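# Binary flags for utilization above 80%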
df['high_cpu'] = (df['cpu_usage'] > 80).astype(int)
df['high_memory'] = (df['memory_usage'] > 80).astype(int)

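# Calendar features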
df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.weekday

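# Persist the full history (including engineered features) back to the raw CSV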
df.to_csv(raw_csv_path, index=False)

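# Clean a copy of the history: drop fully empty rows, forward-fill gaps, then fall back to column medians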
cleaned_df = df.copy()

cleaned_df.dropna(how="all", inplace=True)
cleaned_df.ffill(inplace=True)
cleaned_df.fillna(cleaned_df.median(numeric_only=True), inplace=True)

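# Treat zero disk/network readings as missing and impute them with the column median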
for col in ['disk_read', 'disk_write', 'network_receive', 'network_transmit']:
    cleaned_df[col] = cleaned_df[col].replace(0, cleaned_df[col].median())

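# Keep only samples from the last seven days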
valid_days = pd.date_range(end=pd.Timestamp.today().date(), periods=7).strftime("%Y-%m-%d").tolist()
cleaned_df = cleaned_df[cleaned_df['timestamp'].dt.date.astype(str).isin(valid_days)]

cleaned_df.to_csv(processed_csv_path, index=False)

print(f"Data cleaned and processed successfully! Saved to: {processed_csv_path}")
print(f"Last Timestamp in CSV: {cleaned_df['timestamp'].max()}")

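# Re-apply the previously fitted scaler; it expects the same feature columns it was fitted on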
if os.path.exists(scaler_path):
    scaler = joblib.load(scaler_path)
    df_scaled = cleaned_df.copy()

    # Scale every column except the timestamp, then write the scaled values back
    df_scaled_no_ts = df_scaled.drop(columns=["timestamp"])
    df_scaled_no_ts[df_scaled_no_ts.columns] = scaler.transform(df_scaled_no_ts)
    df_scaled[df_scaled_no_ts.columns] = df_scaled_no_ts

    df_scaled.to_csv(scaled_csv_path, index=False)
    print(f"Scaled data updated and saved to: {scaled_csv_path}")
else:
    print(f"Warning: scaler not found at {scaler_path}; skipping scaling step.")