import os

import numpy as np
import pandas as pd
import torch

# The project root is taken to be the parent of the directory that contains
# this file; market data is cached under <project_root>/data.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
data_dir = os.path.join(project_root, "data")
# exist_ok=True avoids the check-then-create race of testing os.path.exists()
# first: the call succeeds whether or not the directory is already there.
os.makedirs(data_dir, exist_ok=True)
CACHE_FILE = os.path.join(data_dir, "vfv_market_data.csv")

# Number of consecutive log returns that make up one window.
WINDOW_SIZE = 15
|
|
def get_processed_tensors(cache_file=None, window_size=None):
    """Build normalized log-return windows from the cached price CSV.

    Reads the yfinance CSV (two header rows, i.e. MultiIndex columns), keeps
    only the first header level as column names and uses the ``Close`` column
    as the price series. Consecutive prices are converted to log returns,
    which are cut into overlapping windows (stride 1). Each window is
    centered on its own mean and, unless it is essentially flat, divided by
    its own standard deviation.

    Args:
        cache_file: Path of the CSV to read. Defaults to the module-level
            ``CACHE_FILE``.
        window_size: Number of log returns per window. Defaults to the
            module-level ``WINDOW_SIZE``.

    Returns:
        A ``torch.float32`` tensor of shape ``(num_windows, window_size)``,
        or ``None`` (after printing an error message) when the file does not
        exist or contains fewer than ``window_size + 1`` usable prices.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if cache_file is None:
        cache_file = CACHE_FILE
    if window_size is None:
        window_size = WINDOW_SIZE
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    if not os.path.exists(cache_file):
        print(f"Error: {cache_file} not found. Run your fetcher script first.")
        return None

    df = pd.read_csv(cache_file, header=[0, 1], index_col=0, parse_dates=True)

    # Collapse the two-level header to its first level so the price column
    # can be addressed simply as 'Close'.
    df.columns = df.columns.get_level_values(0)

    # Unparseable cells become NaN and are dropped.
    prices = pd.to_numeric(df['Close'], errors='coerce').dropna().to_numpy()
    # Zero, negative or infinite prices would produce inf/NaN log returns and
    # poison every window that contains them, so they are discarded as well.
    prices = prices[np.isfinite(prices) & (prices > 0)]

    # window_size returns require window_size + 1 prices.
    if len(prices) < window_size + 1:
        print(f"Error: Not enough data. Need at least {window_size + 1} points.")
        return None

    log_returns = np.log(prices[1:] / prices[:-1])

    # One row per window; rows are views into log_returns, so nothing is
    # copied until the arithmetic below.
    windows = np.lib.stride_tricks.sliding_window_view(log_returns, window_size)
    mu = windows.mean(axis=1, keepdims=True)
    std = windows.std(axis=1, keepdims=True)

    # Essentially flat windows (std <= 1e-9) are only centered, not scaled,
    # to avoid dividing by (almost) zero.
    scale = np.where(std > 1e-9, std, 1.0)
    normalized = (windows - mu) / scale

    return torch.tensor(normalized, dtype=torch.float32)
|
|
if __name__ == "__main__":
    # Manual smoke test: build the tensors and print the first and last windows.
    tensors = get_processed_tensors()
    if tensors is not None:
        print("--- Processing Complete ---")
        print(f"Tensor Shape: {tensors.shape}")
        print("\nFirst Window Example:")
        print(tensors[0])
        # Use the tensor's real window length instead of a hard-coded 15 so
        # the label stays correct if WINDOW_SIZE is changed.
        # NOTE(review): "minutes" assumes the CSV holds 1-minute bars - confirm.
        print(f"\nLatest Window Example (Last {tensors.shape[1]} minutes):")
        print(tensors[-1])