WeatherAPI Historical Data Mining: ML Patterns & Predictive Analytics
Weather data isn’t just about today’s forecast—it’s a treasure trove of patterns stretching back over 15 years that can revolutionise your business intelligence. WeatherAPI’s historical dataset, covering from 1st January 2010 to present, offers developers an unprecedented opportunity to build sophisticated machine learning models that transform weather patterns into actionable business insights.
With over 4 million locations worldwide and sub-hourly granularity, WeatherAPI’s historical endpoint provides the foundation for predictive analytics that can optimise everything from supply chain management to energy consumption forecasting.
Understanding WeatherAPI’s Historical Data Architecture
WeatherAPI’s historical endpoint (/history.json) delivers comprehensive weather data with remarkable depth. Each historical query returns temperature, precipitation, wind patterns, humidity, atmospheric pressure, UV index, and air quality data—all crucial variables for machine learning feature engineering.
import requests
import pandas as pd
from datetime import datetime, timedelta
def fetch_historical_data(api_key, location, start_date, end_date):
    """Fetch hourly historical weather data for ML preprocessing.

    Args:
        api_key: WeatherAPI key.
        location: Location query (city name, lat/lon, postcode, ...).
        start_date: Inclusive start date, 'YYYY-MM-DD'.
        end_date: Inclusive end date, 'YYYY-MM-DD'.

    Returns:
        pandas.DataFrame with one row per hour across the date range.

    Raises:
        requests.HTTPError: if the API responds with an HTTP error status.
        ValueError: if the API returns an application-level error payload.
    """
    base_url = "https://api.weatherapi.com/v1/history.json"
    all_data = []
    current_date = datetime.strptime(start_date, '%Y-%m-%d')
    end_date_obj = datetime.strptime(end_date, '%Y-%m-%d')
    while current_date <= end_date_obj:
        params = {
            'key': api_key,
            'q': location,
            'dt': current_date.strftime('%Y-%m-%d')
        }
        # Timeout prevents a hung connection from stalling a multi-year crawl;
        # raise_for_status fails fast instead of a cryptic KeyError below.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        # WeatherAPI reports quota/key problems as 'error' in the JSON body.
        if 'error' in data:
            raise ValueError(
                f"WeatherAPI error for {current_date:%Y-%m-%d}: {data['error']}"
            )
        # Extract hourly data for detailed analysis
        for hour in data['forecast']['forecastday'][0]['hour']:
            all_data.append({
                'datetime': hour['time'],
                'temp_c': hour['temp_c'],
                'humidity': hour['humidity'],
                'wind_kph': hour['wind_kph'],
                'wind_dir': hour['wind_dir'],
                'pressure_mb': hour['pressure_mb'],
                'precip_mm': hour['precip_mm'],
                'cloud': hour['cloud'],
                'feelslike_c': hour['feelslike_c'],
                'uv': hour['uv']
            })
        current_date += timedelta(days=1)
    return pd.DataFrame(all_data)
Feature Engineering for Weather-Based ML Models
Raw weather data requires sophisticated preprocessing to unlock its predictive potential. Seasonal patterns, cyclical trends, and lag features are essential components of effective weather-based machine learning models.
import numpy as np
from sklearn.preprocessing import StandardScaler
from datetime import datetime
class WeatherFeatureEngineer:
    """Derive temporal, lag, and composite weather features for ML models.

    Expects DataFrames shaped like the output of ``fetch_historical_data``
    (hourly rows with 'datetime', 'temp_c', 'humidity', 'wind_kph',
    'precip_mm', ... columns). Methods mutate and return the input frame.
    """

    def __init__(self):
        # Kept for downstream pipelines that want to standardise features.
        self.scaler = StandardScaler()

    def create_temporal_features(self, df):
        """Extract temporal patterns crucial for weather ML.

        Adds hour/day-of-year/month/season columns plus sine/cosine
        encodings so models see 23:00 and 00:00 (or day 365 and day 1)
        as neighbours rather than opposite extremes.
        """
        df['datetime'] = pd.to_datetime(df['datetime'])
        df['hour'] = df['datetime'].dt.hour
        df['day_of_year'] = df['datetime'].dt.dayofyear
        df['month'] = df['datetime'].dt.month
        df['season'] = df['month'].apply(self._get_season)
        # Cyclical encoding for temporal features
        df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
        df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
        df['day_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
        df['day_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
        return df

    def create_lag_features(self, df, target_col, lags=(1, 3, 6, 12, 24)):
        """Create lag and rolling-window features for time-series patterns.

        ``lags`` default is now an immutable tuple (was a mutable list — a
        classic Python default-argument pitfall); any iterable of ints works.
        """
        for lag in lags:
            df[f'{target_col}_lag_{lag}'] = df[target_col].shift(lag)
        # Rolling statistics over the previous 24 rows (24h at hourly cadence).
        df[f'{target_col}_rolling_mean_24h'] = df[target_col].rolling(24).mean()
        df[f'{target_col}_rolling_std_24h'] = df[target_col].rolling(24).std()
        return df

    def create_weather_indices(self, df):
        """Derive composite indices: heat index, wind chill, precip category."""
        # Heat index approximation (simplified, not the full NOAA regression).
        df['heat_index'] = df['temp_c'] + 0.5 * df['humidity'] / 100 * (df['temp_c'] - 14.5)
        # Wind chill factor (metric wind-chill formula coefficients).
        df['wind_chill'] = 13.12 + 0.6215 * df['temp_c'] - 11.37 * (df['wind_kph'] ** 0.16) + 0.3965 * df['temp_c'] * (df['wind_kph'] ** 0.16)
        # Precipitation intensity categories (mm per hour, right-closed bins).
        df['precip_category'] = pd.cut(df['precip_mm'],
                                       bins=[0, 0.1, 2.5, 7.5, float('inf')],
                                       labels=['none', 'light', 'moderate', 'heavy'])
        return df

    @staticmethod
    def _get_season(month):
        # Meteorological (northern-hemisphere) seasons.
        if month in [12, 1, 2]:
            return 'winter'
        elif month in [3, 4, 5]:
            return 'spring'
        elif month in [6, 7, 8]:
            return 'summer'
        else:
            return 'autumn'
Building Predictive Models with Historical Weather Data
Historical weather patterns reveal compelling correlations with business metrics. Retail sales, energy consumption, agricultural yields, and transportation efficiency all exhibit measurable relationships with weather conditions that span multiple years.
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
class WeatherBusinessPredictor:
    """Fit and compare regressors that predict a business metric from weather."""

    def __init__(self):
        # Candidate regressors, all seeded for reproducibility.
        self.models = {
            'random_forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'gradient_boost': GradientBoostingRegressor(n_estimators=100, random_state=42),
            'xgboost': xgb.XGBRegressor(n_estimators=100, random_state=42)
        }
        self.best_model = None
        self.feature_importance = None

    def prepare_features(self, weather_df, business_df):
        """Join weather features onto business metrics by timestamp.

        Returns (X, y): the weather feature matrix and the 'business_metric'
        target, with NaN rows dropped (lag/rolling features are NaN at the
        start of the series).
        """
        joined = pd.merge(weather_df, business_df, on='datetime', how='inner')
        feature_columns = [
            'temp_c', 'humidity', 'wind_kph', 'pressure_mb', 'precip_mm',
            'cloud', 'uv', 'heat_index', 'wind_chill',
            'hour_sin', 'hour_cos', 'day_sin', 'day_cos',
            'temp_c_lag_1', 'temp_c_lag_24', 'temp_c_rolling_mean_24h'
        ]
        joined.dropna(inplace=True)
        return joined[feature_columns], joined['business_metric']

    def train_and_evaluate(self, X, y):
        """Cross-validate each candidate, keep the best, refit on all data.

        Uses TimeSeriesSplit so validation folds always come after their
        training data (no look-ahead leakage). Returns (results, best_name).
        """
        splitter = TimeSeriesSplit(n_splits=5)
        results = {}
        for name, model in self.models.items():
            fold_scores = []
            for train_idx, val_idx in splitter.split(X):
                model.fit(X.iloc[train_idx], y.iloc[train_idx])
                preds = model.predict(X.iloc[val_idx])
                fold_scores.append(r2_score(y.iloc[val_idx], preds))
            results[name] = {
                'mean_score': np.mean(fold_scores),
                'std_score': np.std(fold_scores)
            }
        # Winner = highest mean R^2 across folds; refit it on the full set.
        winner = max(results, key=lambda m: results[m]['mean_score'])
        self.best_model = self.models[winner]
        self.best_model.fit(X, y)
        # Tree-based models expose per-feature importances for analysis.
        if hasattr(self.best_model, 'feature_importances_'):
            self.feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': self.best_model.feature_importances_
            }).sort_values('importance', ascending=False)
        return results, winner
Advanced Pattern Recognition Techniques
WeatherAPI's extensive historical dataset enables sophisticated pattern recognition that goes beyond simple correlations. Seasonal decomposition, anomaly detection, and cluster analysis reveal hidden relationships in weather data that traditional forecasting methods often miss.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
class WeatherPatternAnalyser:
    """Pattern-recognition helpers: decomposition, regimes, anomaly flags."""

    def __init__(self):
        self.seasonal_patterns = {}
        self.weather_clusters = None

    def seasonal_decomposition(self, weather_data, target_column):
        """Split a daily-averaged series into trend/seasonal/residual parts.

        Stores the components in ``self.seasonal_patterns[target_column]``
        and returns the statsmodels decomposition object. period=365
        assumes at least two years of daily observations.
        """
        from statsmodels.tsa.seasonal import seasonal_decompose
        daily_series = weather_data.set_index('datetime').resample('D')[target_column].mean()
        decomposition = seasonal_decompose(daily_series, model='additive', period=365)
        self.seasonal_patterns[target_column] = {
            'trend': decomposition.trend,
            'seasonal': decomposition.seasonal,
            'residual': decomposition.resid
        }
        return decomposition

    def identify_weather_regimes(self, weather_data, n_clusters=6):
        """Label each observation with a K-means weather regime.

        Returns a copy of ``weather_data`` with a 'weather_regime' column;
        rows excluded for NaN feature values stay NaN in that column.
        """
        features = ['temp_c', 'humidity', 'wind_kph', 'pressure_mb', 'precip_mm']
        from sklearn.preprocessing import StandardScaler
        complete_rows = weather_data[features].dropna()
        # Standardise so no single feature dominates the distance metric.
        scaler = StandardScaler()
        standardised = scaler.fit_transform(complete_rows)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        labels = kmeans.fit_predict(standardised)
        weather_data = weather_data.copy()
        weather_data.loc[complete_rows.index, 'weather_regime'] = labels
        # Keep model + scaler so new observations can be assigned later.
        self.weather_clusters = {
            'model': kmeans,
            'scaler': scaler,
            'centres': kmeans.cluster_centers_
        }
        return weather_data

    def detect_weather_anomalies(self, weather_data, contamination=0.1):
        """Flag unusual observations via IsolationForest in 'is_anomaly'.

        ``contamination`` is the expected anomaly fraction. Returns a copy
        of ``weather_data``; IsolationForest labels -1 mark anomalies.
        """
        from sklearn.ensemble import IsolationForest
        features = ['temp_c', 'humidity', 'wind_kph', 'pressure_mb', 'precip_mm', 'uv']
        complete_rows = weather_data[features].dropna()
        detector = IsolationForest(contamination=contamination, random_state=42)
        flags = detector.fit_predict(complete_rows)
        weather_data = weather_data.copy()
        weather_data.loc[complete_rows.index, 'is_anomaly'] = (flags == -1)
        return weather_data
Real-World Business Applications
The combination of WeatherAPI's historical data with machine learning creates powerful applications across industries. Retail chains can predict seasonal demand fluctuations, energy companies can optimise grid management, and agricultural businesses can make informed planting decisions based on historical weather patterns.
Consider a renewable energy application that uses multiple years of historical weather data (five by default in the example below) to predict optimal solar panel installation locations:
class RenewableEnergyOptimiser:
    """Rank candidate sites for solar installations from historical weather."""

    def __init__(self, api_key):
        self.api_key = api_key

    def analyse_solar_potential(self, locations, years_back=5):
        """Compute solar-potential metrics for each location.

        Pulls ``years_back`` years of history per location and returns a
        dict mapping location -> metrics dict.

        NOTE(review): relies on ``self.fetch_solar_data``, which is not
        defined in this class — presumably provided elsewhere; confirm
        before use.
        """
        results = {}
        for location in locations:
            # Multi-year window ending now, recomputed per location.
            window_end = datetime.now()
            window_start = window_end - timedelta(days=365 * years_back)
            history = self.fetch_solar_data(location, window_start, window_end)
            results[location] = self.calculate_solar_metrics(history)
        return results

    def calculate_solar_metrics(self, weather_data):
        """Summarise solar-energy potential from hourly weather rows.

        Adds 'estimated_irradiance' and 'month' columns to ``weather_data``
        in place (assumes 'datetime' is a datetime64 column — TODO confirm
        with callers) and returns a dict of aggregate metrics.
        """
        # Crude proxy: UV index scaled by the clear fraction of the sky.
        weather_data['estimated_irradiance'] = (
            weather_data['uv'] * (100 - weather_data['cloud']) / 100
        )
        per_day = weather_data.groupby(
            weather_data['datetime'].dt.date
        )['estimated_irradiance'].sum()
        weather_data['month'] = weather_data['datetime'].dt.month
        per_month = weather_data.groupby('month')['estimated_irradiance'].mean()
        return {
            'annual_potential': per_day.sum(),
            'average_daily': per_day.mean(),
            'seasonal_variation': per_month.std() / per_month.mean(),
            'peak_month': per_month.idxmax(),
            'low_month': per_month.idxmin(),
            # 1 - coefficient of variation: higher = steadier daily output.
            'consistency_score': 1 - (per_day.std() / per_day.mean())
        }
Performance Optimisation for Large Datasets
Working with years of historical weather data requires careful attention to performance and memory management. WeatherAPI's efficient response structure, combined with your own code-level optimisation strategies, determines how well your machine learning pipeline scales.
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor
class OptimisedWeatherDataLoader:
    """Concurrent fetching and chunked processing of historical weather data."""

    def __init__(self, api_key, max_concurrent=10):
        self.api_key = api_key
        self.max_concurrent = max_concurrent  # cap on in-flight API requests

    async def fetch_batch_historical(self, location_date_pairs):
        """Fetch many (location, date) history payloads concurrently.

        A semaphore throttles concurrency to ``self.max_concurrent``.
        Returns parsed JSON payloads in the same order as
        ``location_date_pairs``.
        """
        gate = asyncio.Semaphore(self.max_concurrent)

        async def one_request(session, location, date):
            async with gate:
                params = {
                    'key': self.api_key,
                    'q': location,
                    'dt': date.strftime('%Y-%m-%d')
                }
                async with session.get("https://api.weatherapi.com/v1/history.json",
                                       params=params) as response:
                    return await response.json()

        async with aiohttp.ClientSession() as session:
            pending = [one_request(session, loc, day)
                       for loc, day in location_date_pairs]
            return await asyncio.gather(*pending)

    def process_data_chunks(self, data, chunk_size=1000):
        """Feature-engineer ``data`` in ``chunk_size`` slices, then concat.

        NOTE(review): relies on ``self.engineer_features``, which is not
        defined in this class — presumably supplied by a subclass or mixin;
        confirm before use.
        """
        processed = [
            self.engineer_features(data[start:start + chunk_size])
            for start in range(0, len(data), chunk_size)
        ]
        return pd.concat(processed, ignore_index=True)
Getting Started with WeatherAPI Historical Data Mining
Ready to unlock the predictive power of historical weather data? Sign up for your free WeatherAPI account and gain access to our comprehensive historical dataset. With 100,000 free API calls monthly, you can begin building sophisticated machine learning models without any upfront investment.
WeatherAPI's historical endpoint provides the foundation for transformative business intelligence. From demand forecasting to risk assessment, the patterns hidden in 15+ years of weather data are waiting to revolutionise your applications.
Start building your weather-powered ML models today. The insights you discover could reshape your entire business strategy.
