Source code for tempdataset.core.datasets.energy

"""
Energy dataset generator.

Generates realistic energy consumption and production data with smart meter readings
including electricity, gas, solar, and wind energy sources.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


[docs] class EnergyDataset(BaseDataset): """ Energy dataset generator that creates realistic energy meter data. Generates energy consumption and production data including: - Reading identification (reading_id, timestamp, meter_id) - Location and energy source information - Consumption and production metrics - Cost calculations and tariff plans - Peak demand and outage tracking - CO2 emissions calculations """ def __init__(self, rows: int = 500): """ Initialize the EnergyDataset generator. Args: rows: Number of rows to generate (default: 500) """ super().__init__(rows) self.faker_utils = get_faker_utils() # Initialize data for consistent generation self._init_data_lists() # Counter for sequential IDs self._reading_counter = 1 def _init_data_lists(self) -> None: """Initialize predefined data lists for realistic generation.""" # Energy sources self.energy_sources = ['Electricity', 'Gas', 'Solar', 'Wind'] # Tariff plans self.tariff_plans = ['Standard', 'Time-of-Use', 'Peak/Off-Peak'] # US states and cities self.locations = [ 'New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Houston, TX', 'Phoenix, AZ', 'Philadelphia, PA', 'San Antonio, TX', 'San Diego, CA', 'Dallas, TX', 'San Jose, CA', 'Austin, TX', 'Jacksonville, FL', 'Fort Worth, TX', 'Columbus, OH', 'Charlotte, NC', 'San Francisco, CA', 'Indianapolis, IN', 'Seattle, WA', 'Denver, CO', 'Washington, DC' ] # CO2 emission factors (kg CO2 per kWh) by energy source self.co2_factors = { 'Electricity': 0.4, # Grid average 'Gas': 0.2, 'Solar': 0.0, 'Wind': 0.0 } # Typical tariff rates (USD per kWh) by plan self.tariff_rates = { 'Standard': 0.12, 'Time-of-Use': {'peak': 0.18, 'off_peak': 0.08}, 'Peak/Off-Peak': {'peak': 0.20, 'off_peak': 0.06} }
[docs] def generate(self) -> List[Dict[str, Any]]: """ Generate energy dataset rows. Returns: List of dictionaries representing energy meter readings """ if self.seed is not None: random.seed(self.seed) self.faker_utils.set_seed(self.seed) data = [] for i in range(self.rows): row = self._generate_row() data.append(row) return data
def _generate_row(self) -> Dict[str, Any]: """Generate a single energy meter reading row.""" # Generate timestamp (within last 30 days) end_time = datetime.now() start_time = end_time - timedelta(days=30) date_part = self.faker_utils.date_between(start_time, end_time) # Convert date to datetime with random time timestamp = datetime.combine(date_part, datetime.min.time()) + timedelta( hours=random.randint(0, 23), minutes=random.randint(0, 59), seconds=random.randint(0, 59) ) # Select energy source and location energy_source = random.choice(self.energy_sources) location = random.choice(self.locations) # Generate consumption based on energy source and time consumption_kwh = self._get_consumption(energy_source, timestamp) # Generate production (mainly for renewable sources) production_kwh = self._get_production(energy_source, timestamp) # Calculate net usage net_usage_kwh = consumption_kwh - production_kwh # Select tariff plan and calculate cost tariff_plan = random.choice(self.tariff_plans) cost_usd = self._calculate_cost(consumption_kwh, tariff_plan, timestamp) # Generate peak demand peak_demand_kw = consumption_kwh * random.uniform(0.8, 1.5) # Generate outage data (5% chance of outage) outage_flag = random.random() < 0.05 outage_duration_min = random.randint(5, 240) if outage_flag else 0 # Calculate CO2 emissions co2_emissions_kg = consumption_kwh * self.co2_factors[energy_source] return { 'reading_id': self._generate_reading_id(), 'timestamp': timestamp.strftime('%Y-%m-%d %H:%M:%S'), 'meter_id': self._generate_meter_id(), 'location': location, 'energy_source': energy_source, 'consumption_kwh': round(consumption_kwh, 2), 'production_kwh': round(production_kwh, 2), 'net_usage_kwh': round(net_usage_kwh, 2), 'cost_usd': round(cost_usd, 2), 'tariff_plan': tariff_plan, 'peak_demand_kw': round(peak_demand_kw, 2), 'outage_flag': outage_flag, 'outage_duration_min': outage_duration_min, 'co2_emissions_kg': round(co2_emissions_kg, 3) } def _generate_reading_id(self) -> str: """ Generate reading ID in format "ENG-YYYY-NNNNNN". Returns: Formatted reading ID """ year = datetime.now().year reading_num = str(self._reading_counter).zfill(6) self._reading_counter += 1 return f"ENG-{year}-{reading_num}" def _generate_meter_id(self) -> str: """ Generate meter ID in format "MTR-AAANNN". Returns: Formatted meter ID """ letters = ''.join(random.choices(string.ascii_uppercase, k=3)) numbers = ''.join(random.choices(string.digits, k=3)) return f"MTR-{letters}{numbers}" def _get_consumption(self, energy_source: str, timestamp: datetime) -> float: """ Get energy consumption based on source and time. Args: energy_source: Type of energy source timestamp: Current timestamp Returns: Energy consumption in kWh """ hour = timestamp.hour month = timestamp.month # Base consumption patterns if energy_source == 'Electricity': # Higher consumption during peak hours and summer/winter base_consumption = 25 if hour in [7, 8, 9, 17, 18, 19, 20]: # Peak hours base_consumption *= 1.5 if month in [6, 7, 8, 12, 1, 2]: # Summer/Winter base_consumption *= 1.3 return base_consumption + random.uniform(-5, 10) elif energy_source == 'Gas': # Higher in winter months base_consumption = 15 if month in [11, 12, 1, 2, 3]: # Winter base_consumption *= 2 return base_consumption + random.uniform(-3, 8) elif energy_source in ['Solar', 'Wind']: # Lower consumption as these are mainly production sources return random.uniform(2, 8) return random.uniform(10, 30) def _get_production(self, energy_source: str, timestamp: datetime) -> float: """ Get energy production based on source and time. Args: energy_source: Type of energy source timestamp: Current timestamp Returns: Energy production in kWh """ hour = timestamp.hour month = timestamp.month if energy_source == 'Solar': # Production only during daylight hours if 6 <= hour <= 18: # Peak production around noon if 10 <= hour <= 14: base_production = 20 else: base_production = 10 # Higher production in summer if month in [5, 6, 7, 8, 9]: base_production *= 1.4 return base_production + random.uniform(-3, 5) else: return 0.0 elif energy_source == 'Wind': # Variable production throughout the day base_production = 12 # Higher production in winter and spring if month in [11, 12, 1, 2, 3, 4]: base_production *= 1.3 return base_production + random.uniform(-5, 8) else: # Electricity and Gas don't typically produce energy return 0.0 def _calculate_cost(self, consumption_kwh: float, tariff_plan: str, timestamp: datetime) -> float: """ Calculate energy cost based on consumption and tariff plan. Args: consumption_kwh: Energy consumption tariff_plan: Tariff plan type timestamp: Current timestamp Returns: Cost in USD """ hour = timestamp.hour if tariff_plan == 'Standard': return consumption_kwh * self.tariff_rates['Standard'] elif tariff_plan == 'Time-of-Use': # Peak hours: 7-9 AM, 5-8 PM if hour in [7, 8, 9, 17, 18, 19, 20]: rate = self.tariff_rates['Time-of-Use']['peak'] else: rate = self.tariff_rates['Time-of-Use']['off_peak'] return consumption_kwh * rate elif tariff_plan == 'Peak/Off-Peak': # Peak hours: 6-10 AM, 4-9 PM if hour in [6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21]: rate = self.tariff_rates['Peak/Off-Peak']['peak'] else: rate = self.tariff_rates['Peak/Off-Peak']['off_peak'] return consumption_kwh * rate return consumption_kwh * 0.12 # Default rate
[docs] def get_schema(self) -> Dict[str, str]: """ Return column schema with types. Returns: Dictionary mapping column names to their data types """ return { 'reading_id': 'string', 'timestamp': 'datetime', 'meter_id': 'string', 'location': 'string', 'energy_source': 'string', 'consumption_kwh': 'float', 'production_kwh': 'float', 'net_usage_kwh': 'float', 'cost_usd': 'float', 'tariff_plan': 'string', 'peak_demand_kw': 'float', 'outage_flag': 'boolean', 'outage_duration_min': 'integer', 'co2_emissions_kg': 'float' }