import pandas as pd
xl
gdp_df = pd.read_excel('path_to_your_file/API_NY.GDP.MKTP.CD_DS2_en_excel_v2_3254827.xls', skiprows=3, engine='openpyxl')
unemployment_df = pd.read_excel('path_to_your_file/API_SL.UEM.TOTL.NE.ZS_DS2_en_excel_v2_3139629.xls', skiprows=3, engine='openpyxl')
inflation_df = pd.read_excel('path_to_your_file/API_FP.CPI.TOTL.ZG_DS2_en_excel_v2_3255161.xls', skiprows=3, engine='openpyto 2024
years = [str(year) for year in range(2018, 2025)]
gdp_filtered = gdp_df[['Country Name', 'Country Code'] + years]
unemployment_filtered = unemployment_df[['Country Name', 'Country Code'] + years]
inflation_filtered = inflation_df[['Country Name', 'Country Code'te the metric
gdp_filtered = gdp_filtered.melt(id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='GDP')
unemployment_filtered = unemployment_filtered.melt(id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='Unemployment Rate')
inflation_filtered = inflation_filtered.melt(id_vars=['Country Name', 'Country Code'], var_name='Year', value_natry Code, and Year
combined_df = pd.merge(gdp_filtered, unemployment_filtered, on=['Country Name', 'Country Code', 'Year'])
combined_df = pd.merge(combined_df, inflation_filtered, on=['Country Name', 'Counfor better readability
combined_df = combined_df.sort_values(by=['mbined data to a CSV file
output_file_path = 'combined_economic_data_2018_2024.csv'
combined_df.to_csv(output_file_path, index=False)
tput_file_path, index=False)

import ccxt
import pandas as pd
from datetime import datetime, timedelta

exchange = ccxt.binance()


top_100_symbols = [
    'BTC', 'ETH', 'SOL', 'BNB', 'XRP', 'ADA', 'DOGE', 'DOT', 'UNI', 'LTC',
    'LINK', 'BCH', 'MATIC', 'ALGO', 'VET', 'XLM', 'ATOM', 'AVAX', 'FTT', 'TRX',
    'SHIB', 'ETC', 'FIL', 'XMR', 'THETA', 'EOS', 'AAVE', 'KSM', 'NEO', 'XTZ',
    'MKR', 'DASH', 'ZIL', 'COMP', 'YFI', 'RUNE', 'SNX', 'ENJ', 'BAT', 'MANA',
    'GRT', '1INCH', 'SUSHI', 'CELO', 'ZRX', 'WAVES', 'OMG', 'ONT', 'QTUM',
    'BTT', 'CHZ', 'IOST', 'ICX', 'ZEN', 'SC', 'UMA', 'KNC', 'BAL', 'BAND',
    'ANKR', 'DGB', 'HNT', 'OCEAN', 'RSR', 'ZEC', 'KLAY', 'CKB', 'ASTR', 'BNX',
    'LUNA', 'FTM', 'HBAR', 'ICP', 'KDA', 'QNT', 'FLOW', 'AXS', 'NEAR', 'RAY',
    'DODO', 'PUNDIX', 'TOMO', 'XEM', 'HOT', 'STMX', 'IOTX', 'LPT', 'SXP',
    'PLA', 'AKRO', 'BZRX', 'STORJ', 'OXT', 'MITH', 'KAVA', 'ROSE', 'CHR', 'CTK',
    'FTT', 'LUNA'
]

def fetch_ohlcv(symbol, since):
    market_symbol = f"{symbol}/USDT"
    all_ohlcv = []
    while True:
        try:
            ohlcv = exchange.fetch_ohlcv(market_symbol, timeframe='1d', since=since, limit=1000)
            if not ohlcv:
                break
            since = ohlcv[-1][0] + 86400000  
            all_ohlcv.extend(ohlcv)
            
            if len(ohlcv) < 1000:
                break
        except Exception as e:
            print(f"Error fetching data for {market_symbol}: {e}")
            break
    
    df = pd.DataFrame(all_ohlcv, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    df['symbol'] = symbol
    return df


all_data = pd.DataFrame()

# Define the start date
start_date = '2010-01-01'
since_timestamp = int(datetime.strptime(start_date, '%Y-%m-%d').timestamp() * 1000)

for symbol in top_100_symbols:
    print(f"Fetching data for {symbol}...")
    df = fetch_ohlcv(symbol, since_timestamp)
    all_data = pd.concat([all_data, df], ignore_index=True)


output_path = 'crypto_data.csv'
all_data.to_csv(output_path, index=False)


print(f"Data collection process completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Data has been saved to the file: {output_path}")

import pandas as pd
import ta 

ccxt_data = pd.read_csv('crypto_data.csv')
coingecko_data = pd.read_excel('all_coins_historical_data_COINGEKO.xlsx')
uniswap_data = pd.read_excel('uniswap_pools_data_filtered.xlsx')

ccxt_data['timestamp'] = pd.to_datetime(ccxt_data['timestamp'])
coingecko_data['timestamp'] = pd.to_datetime(coingecko_data['timestamp'])
uniswap_data['created_at'] = pd.to_datetime(uniswap_data['created_at'])

uniswap_data.rename(columns={'created_at': 'timestamp'}, inplace=True)

ccxt_data_filtered = ccxt_data
coingecko_data_filtered = coingecko_data
uniswap_data_filtered = uniswap_data

ccxt_data_filtered.drop_duplicates(subset=['symbol', 'timestamp'], inplace=True)
coingecko_data_filtered.drop_duplicates(subset=['symbol', 'timestamp'], inplace=True)
uniswap_data_filtered.drop_duplicates(subset=['pair', 'timestamp'], inplace=True)

merged_data = pd.merge(ccxt_data_filtered, coingecko_data_filtered, on=['symbol', 'timestamp'], how='outer')

if 'pair' not in merged_data.columns:
    merged_data['pair'] = None

merged_data = pd.merge(merged_data, uniswap_data_filtered, on=['pair', 'timestamp'], how='outer')

merged_data.fillna(0, inplace=True)

merged_data.drop_duplicates(inplace=True)

merged_data['RSI'] = ta.momentum.RSIIndicator(close=merged_data['close'], window=14).rsi()
merged_data['MACD'] = ta.trend.MACD(close=merged_data['close']).macd()
merged_data['MACD_signal'] = ta.trend.MACD(close=merged_data['close']).macd_signal()
merged_data['Bollinger_High'] = ta.volatility.BollingerBands(close=merged_data['close']).bollinger_hband()
merged_data['Bollinger_Low'] = ta.volatility.BollingerBands(close=merged_data['close']).bollinger_lband()
merged_data['SMA'] = ta.trend.SMAIndicator(close=merged_data['close'], window=20).sma_indicator()
merged_data['EMA'] = ta.trend.EMAIndicator(close=merged_data['close'], window=20).ema_indicator()
merged_data['ATR'] = ta.volatility.AverageTrueRange(high=merged_data['high'], low=merged_data['low'], close=merged_data['close'], window=14).average_true_range()
merged_data['OBV'] = ta.volume.OnBalanceVolumeIndicator(close=merged_data['close'], volume=merged_data['volume']).on_balance_volume()


merged_data['Market_Cap'] = merged_data['market_cap']

merged_data['Transaction_Volume'] = merged_data['total_volume']
merged_data['Active_Addresses'] = 0  

merged_data['momentum'] = merged_data['close'].pct_change(periods=5)  
merged_data['volatility'] = merged_data['close'].rolling(window=20).std() 


columns_to_drop = ['price', 'current_price', 'circulating_supply', 'total_supply', 'max_supply', 'ath', 'ath_date', 'atl', 'atl_date']
merged_data.drop(columns=columns_to_drop, inplace=True)

merged_data.to_csv('cleaned_merged_crypto_data.csv', index=False)

print("Data merging and cleaning complete. Saved to 'cleaned_merged_crypto_data.csv'.")

import pandas as pd
import numpy as np

data = pd.read_excel('final_reduced_v3.xlsx')

data['close_lag1'] = data['close'].shift(1)
data['RSI_lag1'] = data['RSI'].shift(1)
data['MACD_hist_lag1'] = data['MACD_hist'].shift(1)
data['Volatility_lag1'] = data['Volatility'].shift(1)

data['close_rolling_mean_7'] = data['close'].rolling(window=7).mean()
data['close_rolling_std_7'] = data['close'].rolling(window=7).std()
data['RSI_rolling_mean_14'] = data['RSI'].rolling(window=14).mean()
data['Momentum_rolling_mean_14'] = data['Momentum'].rolling(window=14).mean()

data['SMA_50'] = data['close'].rolling(window=50).mean()
data['EMA_50'] = data['close'].ewm(span=50, adjust=False).mean()
data['Bollinger_band_width'] = data['MACD_hist'] - data['ATR']  

data['RoC_5'] = data['close'].pct_change(periods=5) * 100
data['RSI_derivative'] = data['RSI'].diff()


data['Historical_volatility_14'] = data['close'].pct_change().rolling(window=14).std() * np.sqrt(14)
data['ATR_rolling_mean_14'] = data['ATR'].rolling(window=14).mean()


data['RSI_Momentum'] = data['RSI'] * data['Momentum']
data['Volatility_google_trends'] = data['Volatility'] * data['google_trends']


data.dropna(inplace=True)

final_feature_engineered_file_path = 'final_feature_engineered.xlsx'
data.to_excel(final_feature_engineered_file_path, index=False)

print(f"Final dataset with engineered features saved at: {final_feature_engineered_file_path}")

import pandas as pd

file_path = r"C:\Users\admin\Desktop\Desights.ai\final_feature_engineered.xlsx" 

data = pd.read_excel(file_path)

print(data.head())

print(data.info())

print(data.describe())

   timestamp       close symbol        RSI  MACD_hist         ATR  \
0 2018-01-10    122.2390    NEO  75.039440   3.208903   15.567603   
1 2018-01-11  13238.7800    BTC   0.000000   0.000000    0.000000   
2 2018-01-11   1138.9300    ETH  68.513962  31.373431  144.634302   
3 2018-01-11     21.2024    BNB  70.124396   1.249393    3.431163   
4 2018-01-11    223.7700    LTC  49.449340  -8.252057   40.244965   

            OBV  Momentum  Volatility  google_trends  ...  \
0  5.090000e+14   47.3860   18.837929             81  ...   
1  3.819381e+04 -141.2200    0.000000             13  ...   
2 -3.662706e+05  383.9400  199.219031              4  ...   
3  5.798660e+08   12.7524    5.468572              4  ...   
4  6.130000e+11    1.1600   18.433306              4  ...   

   Momentum_rolling_mean_14       SMA_50       EMA_50  Bollinger_band_width  \
0                125.608071  3326.001258  4496.626759            -12.358700   
1                 87.007357  3323.176858  4839.456297              0.000000   
2                113.768236  3330.855658  4694.337619           -113.260870   
3                113.651979  3331.110706  4511.077414             -2.181770   
4                111.718550  3331.133906  4342.947712            -48.497022   

       RoC_5  RSI_derivative  Historical_volatility_14  ATR_rolling_mean_14  \
0  -3.520916       19.411833                173.755969            38.911361   
1 -11.191386      -75.039440                192.004323            30.624546   
2  -8.666399       68.513962                191.992766            40.750103   
3  21.156571        1.610434                193.707072            38.072776   
4  -9.635343      -20.675057                192.386858            39.950837   

   RSI_Momentum  Volatility_google_trends  
0   3555.818924               1525.872279  
1      0.000000                  0.000000  
2  26305.250651                796.876125  
3    894.254354                 21.874289  
4     57.361234                 73.733223  

[5 rows x 30 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141141 entries, 0 to 141140
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   timestamp                 141141 non-null  datetime64[ns]
 1   close                     141141 non-null  float64       
 2   symbol                    141141 non-null  object        
 3   RSI                       141141 non-null  float64       
 4   MACD_hist                 141141 non-null  float64       
 5   ATR                       141141 non-null  float64       
 6   OBV                       141141 non-null  float64       
 7   Momentum                  141141 non-null  float64       
 8   Volatility                141141 non-null  float64       
 9   google_trends             141141 non-null  int64         
 10  GDP                       141141 non-null  int64         
 11  Unemployment Rate         141141 non-null  float64       
 12  Inflation                 141141 non-null  float64       
 13  close_lag1                141141 non-null  float64       
 14  RSI_lag1                  141141 non-null  float64       
 15  MACD_hist_lag1            141141 non-null  float64       
 16  Volatility_lag1           141141 non-null  float64       
 17  close_rolling_mean_7      141141 non-null  float64       
 18  close_rolling_std_7       141141 non-null  float64       
 19  RSI_rolling_mean_14       141141 non-null  float64       
 20  Momentum_rolling_mean_14  141141 non-null  float64       
 21  SMA_50                    141141 non-null  float64       
 22  EMA_50                    141141 non-null  float64       
 23  Bollinger_band_width      141141 non-null  float64       
 24  RoC_5                     141141 non-null  float64       
 25  RSI_derivative            141141 non-null  float64       
 26  Historical_volatility_14  141141 non-null  float64       
 27  ATR_rolling_mean_14       141141 non-null  float64       
 28  RSI_Momentum              141141 non-null  float64       
 29  Volatility_google_trends  141141 non-null  float64       
dtypes: datetime64[ns](1), float64(26), int64(2), object(1)
memory usage: 32.3+ MB
None
                           timestamp          close            RSI  \
count                         141141  141141.000000  141141.000000   
mean   2021-11-08 20:29:00.659340800     557.316603      48.880557   
min              2018-01-10 00:00:00       0.000006       0.000000   
25%              2020-12-05 00:00:00       0.138980      40.151591   
50%              2021-12-25 00:00:00       1.155000      47.883968   
75%              2022-12-23 00:00:00      12.852000      57.129270   
max              2023-12-31 00:00:00   82885.120000      99.999973   
std                              NaN    3983.406050      12.996818   

           MACD_hist            ATR           OBV       Momentum  \
count  141141.000000  141141.000000  1.411410e+05  141141.000000   
mean        0.075663      40.837938  3.825230e+14       2.046276   
min     -4223.879665       0.000000 -4.010000e+14  -39284.000000   
25%        -0.008164       0.012086  2.330000e+14      -0.115100   
50%         0.000118       0.114476  5.090000e+14      -0.000062   
75%         0.019165       1.287853  5.120000e+14       0.097400   
max      2364.597895   13797.132470  5.770000e+14   33068.540000   
std        65.532118     337.234361  2.213789e+14     621.525913   

          Volatility  google_trends           GDP  ...  \
count  141141.000000  141141.000000  1.411410e+05  ...   
mean       36.519021       9.742952  2.444846e+13  ...   
min         0.000000       0.000000  2.065650e+13  ...   
25%         0.008899       0.000000  2.152140e+13  ...   
50%         0.082536       0.000000  2.359400e+13  ...   
75%         0.990429       6.000000  2.574410e+13  ...   
max     15296.343100     100.000000  2.736090e+13  ...   
std       318.334438      20.478331  2.300278e+12  ...   

       Momentum_rolling_mean_14         SMA_50         EMA_50  \
count             141141.000000  141141.000000  141141.000000   
mean                   2.051938     557.880833     558.113831   
min                -3054.030357       2.502042      38.984136   
25%                   -2.272088     194.305677     208.689573   
50%                    0.013411     481.889751     402.391479   
75%                    2.682039     788.894405     747.686592   
max                 2419.537475    3331.721946    4839.456297   
std                  174.567680     459.983917     487.550534   

       Bollinger_band_width         RoC_5  RSI_derivative  \
count         141141.000000  1.411410e+05   141141.000000   
mean             -40.762275  2.376688e+06       -0.000092   
min           -17535.006935 -9.999998e+01      -85.439376   
25%               -1.186467 -9.561334e+01       -5.838056   
50%               -0.102691  9.659864e+00        0.001050   
75%               -0.010107  1.789211e+03        5.800366   
max              111.290272  2.562338e+09       85.323856   
std              352.705632  5.276069e+07       12.272949   

       Historical_volatility_14  ATR_rolling_mean_14  RSI_Momentum  \
count              1.411410e+05        141141.000000  1.411410e+05   
mean               3.646581e+05            40.839709  9.912880e+02   
min                7.599889e+00             0.019622 -1.682891e+06   
25%                3.245239e+02             0.566371 -4.537754e+00   
50%                2.552923e+03             2.250487 -2.463583e-03   
75%                1.371411e+04            33.097520  5.415655e+00   
max                6.017348e+08          1088.655717  2.650318e+06   
std                6.149541e+06            89.572033  3.542575e+04   

       Volatility_google_trends  
count             141141.000000  
mean                 222.298623  
min                    0.000000  
25%                    0.000000  
50%                    0.000000  
75%                    2.013019  
max               126306.974952  
std                 2672.871022  

[8 rows x 29 columns]

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
df = pd.read_excel(r"C:\Users\admin\Desktop\Desights.ai\final_feature_engineered.xlsx", parse_dates=['timestamp'])

features = df.drop(columns=['timestamp', 'close', 'symbol'])
target = df['close']

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred_linear = linear_model.predict(X_test)


mae_linear = mean_absolute_error(y_test, y_pred_linear)
rmse_linear = np.sqrt(mean_squared_error(y_test, y_pred_linear))
r2_linear = r2_score(y_test, y_pred_linear)

print(f'Linear Regression MAE: {mae_linear}')
print(f'Linear Regression RMSE: {rmse_linear}')
print(f'Linear Regression R2: {r2_linear}')

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)


mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest MAE: {mae_rf}')
print(f'Random Forest RMSE: {rmse_rf}')
print(f'Random Forest R2: {r2_rf}')

xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)


mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f'XGBoost MAE: {mae_xgb}')
print(f'XGBoost RMSE: {rmse_xgb}')
print(f'XGBoost R2: {r2_xgb}')

models = ['Linear Regression', 'Random Forest', 'XGBoost']
mae_scores = [mae_linear, mae_rf, mae_xgb]
rmse_scores = [rmse_linear, rmse_rf, rmse_xgb]
r2_scores = [r2_linear, r2_rf, r2_xgb]


plt.figure(figsize=(10, 6))

plt.subplot(1, 3, 1)
sns.barplot(x=models, y=mae_scores)
plt.title('MAE Comparison')

plt.subplot(1, 3, 2)
sns.barplot(x=models, y=rmse_scores)
plt.title('RMSE Comparison')

plt.subplot(1, 3, 3)
sns.barplot(x=models, y=r2_scores)
plt.title('R2 Score Comparison')

plt.tight_layout()
plt.show()

Linear Regression MAE: 490.1010177433609
Linear Regression RMSE: 1575.3819188594143
Linear Regression R2: 0.861422343752859
Random Forest MAE: 3.6691326794656773
Random Forest RMSE: 43.114119494283095
Random Forest R2: 0.9998962087981835
XGBoost MAE: 11.915671255454997
XGBoost RMSE: 79.57343428194504
XGBoost R2: 0.9996464444144895

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_excel(r"C:\Users\admin\Desktop\Desights.ai\final_feature_engineered.xlsx", parse_dates=['timestamp'])


features = df.drop(columns=['timestamp', 'close', 'symbol'])
target = df['close']


X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


y_pred_rf = rf_model.predict(X_test)


mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest MAE: {mae_rf}')
print(f'Random Forest RMSE: {rmse_rf}')
print(f'Random Forest R²: {r2_rf}')

df['predicted_close_rf'] = rf_model.predict(features)


print(df[['symbol', 'timestamp', 'close', 'predicted_close_rf']].head())

df[['symbol', 'timestamp', 'close', 'predicted_close_rf']].to_csv('predicted_prices_rf.csv', index=False)

plt.figure(figsize=(12, 6))
plt.plot(df['timestamp'], df['close'], label='Actual Prices', color='green')
plt.plot(df['timestamp'], df['predicted_close_rf'], label='Predicted Prices', color='blue')
plt.title('Actual vs Predicted Prices')
plt.xlabel('Timestamp')
plt.ylabel('Price')
plt.legend()
plt.show()

Random Forest MAE: 3.6691326794656773
Random Forest RMSE: 43.114119494283095
Random Forest R²: 0.9998962087981835
  symbol  timestamp       close  predicted_close_rf
0    NEO 2018-01-10    122.2390         1212.764326
1    BTC 2018-01-11  13238.7800        11740.859823
2    ETH 2018-01-11   1138.9300         1275.334628
3    BNB 2018-01-11     21.2024         1110.719052
4    LTC 2018-01-11    223.7700          236.233000

import matplotlib.pyplot as plt
import seaborn as sns

importances_rf = rf_model.feature_importances_
indices_rf = np.argsort(importances_rf)[::-1]

plt.figure(figsize=(12, 8))
sns.barplot(x=importances_rf[indices_rf], y=X_train.columns[indices_rf])
plt.title('Random Forest Feature Importances')
plt.xlabel('Feature Importance')
plt.ylabel('Features')
plt.show()

from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

features_to_plot = ['ATR', 'close_rolling_std_7']

fig, ax = plt.subplots(figsize=(12, 8))
PartialDependenceDisplay.from_estimator(rf_model, X_train, features=features_to_plot, ax=ax)

plt.suptitle('Partial Dependence Plots for ATR and close_rolling_std_7', fontsize=16)
plt.subplots_adjust(top=0.9) 
plt.show()

from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt

lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

coefficients = pd.Series(lr_model.coef_, index=X_train.columns)

plt.figure(figsize=(12, 8))
coefficients.sort_values().plot(kind='barh')
plt.title('Linear Regression Coefficients')
plt.xlabel('Coefficient Value')
plt.ylabel('Features')
plt.show()

import pandas as pd
import matplotlib.pyplot as plt

lr_coefficients = pd.Series(lr_model.coef_, index=X_train.columns)

rf_importances = pd.Series(rf_model.feature_importances_, index=X_train.columns)

lr_coefficients_normalized = lr_coefficients / lr_coefficients.abs().max()
rf_importances_normalized = rf_importances / rf_importances.max()

comparison_df = pd.DataFrame({
    'Linear Regression Coefficients': lr_coefficients_normalized,
    'Random Forest Importances': rf_importances_normalized
})
comparison_df.plot(kind='barh', figsize=(12, 10))
plt.title('Comparison of Linear Regression Coefficients and Random Forest Importances')
plt.xlabel('Normalized Value')
plt.ylabel('Features')
plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

file_path = r"C:\Users\admin\Desktop\Desights.ai\final_feature_engineered.xlsx"
df = pd.read_excel(file_path)

X = df.drop(columns=['close'])  
y = df['close'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numeric_X_train = X_train.select_dtypes(include=[float, int])

correlation_matrix = numeric_X_train.corr()

plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()