# Source code for tempdataset.core.datasets.insurance

"""
Insurance dataset generator.

Generates realistic insurance policy, claims, and risk assessment data.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class InsuranceDataset(BaseDataset):
    """
    Insurance dataset generator that creates realistic insurance data.

    Generates 20 columns of insurance data including:
    - Policy information (ID, type, coverage, premiums)
    - Claims data (claim ID, amounts, status)
    - Risk assessment and agent information
    - Geographic and temporal data
    """

    def __init__(self, rows: int = 500):
        """
        Initialize the InsuranceDataset generator.

        Args:
            rows: Number of rows to generate (default: 500)
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()

        # Initialize data for consistent generation
        self._init_data_lists()

        # Counters for sequential IDs
        self._reset_counters()

    def _reset_counters(self) -> None:
        """Put the sequential ID counters back to their starting values."""
        self._policy_counter = 10001
        self._customer_counter = 1
        self._claim_counter = 1001
        self._agent_counter = 101
        # agent name -> agent ID, filled lazily so that one agent always
        # appears with the same ID
        self._agent_ids: Dict[str, str] = {}

    def _init_data_lists(self) -> None:
        """Initialize predefined data lists for realistic generation."""
        # Policy types
        self.policy_types = [
            'Auto', 'Health', 'Property', 'Life', 'Business', 'Travel',
            'Disability',
        ]

        # Payment frequencies
        self.payment_frequencies = [
            'Monthly', 'Quarterly', 'Semi-Annual', 'Annual',
        ]

        # Claim reasons by policy type
        self.claim_reasons = {
            'Auto': [
                'Collision', 'Theft', 'Vandalism', 'Weather Damage',
                'Hit and Run', 'Glass Damage',
            ],
            'Health': [
                'Emergency Room', 'Surgery', 'Prescription Drugs',
                'Specialist Visit', 'Diagnostic Tests', 'Physical Therapy',
            ],
            'Property': [
                'Fire Damage', 'Water Damage', 'Theft', 'Storm Damage',
                'Vandalism', 'Equipment Failure',
            ],
            'Life': ['Death Benefit', 'Terminal Illness', 'Accidental Death'],
            'Business': [
                'Property Damage', 'Liability Claim', 'Business Interruption',
                'Cyber Attack', 'Equipment Loss',
            ],
            'Travel': [
                'Trip Cancellation', 'Medical Emergency', 'Lost Luggage',
                'Flight Delay', 'Emergency Evacuation',
            ],
            'Disability': [
                'Injury', 'Illness', 'Accident', 'Work-Related Disability',
            ],
        }

        # Claim statuses
        self.claim_statuses = [
            'Pending', 'Under Review', 'Approved', 'Rejected', 'Paid',
            'Partially Paid', 'Closed',
        ]

        # Risk ratings
        self.risk_ratings = ['Low', 'Medium', 'High', 'Very High']

        # Agent names
        self.agent_names = [
            'John Smith', 'Sarah Johnson', 'Michael Brown', 'Lisa Davis',
            'Robert Wilson', 'Jennifer Garcia', 'David Miller',
            'Amanda Taylor', 'Christopher Anderson', 'Michelle Martinez',
            'Daniel Thompson', 'Jessica White', 'Matthew Jackson',
            'Ashley Harris', 'James Martin', 'Emily Clark',
            'Andrew Rodriguez', 'Stephanie Lewis', 'Joshua Walker',
            'Nicole Hall',
        ]

        # Regions
        self.regions = [
            'Northeast', 'Southeast', 'Midwest', 'Southwest', 'West',
            'Northwest', 'Central',
        ]

        # Free-text notes attached to a minority of rows; built once here
        # instead of on every generated row
        self.note_options = [
            'Customer requested policy review',
            'Premium discount applied',
            'Multi-policy discount',
            'Good driver discount',
            'No claims bonus',
            'Policy auto-renewed',
            'Payment plan adjusted',
            'Coverage increased',
            'Deductible modified',
            'Agent referral bonus',
        ]

    def generate(self) -> List[Dict[str, Any]]:
        """
        Generate insurance dataset rows.

        When a seed is set, the random generators are re-seeded and the
        sequential ID counters are restarted, so repeated calls on the same
        instance return the same IDs and random values.

        Returns:
            List of dictionaries representing insurance data rows
        """
        # NOTE(review): self.seed and self.rows are not set in this class;
        # presumably provided by BaseDataset - confirm.
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
            # Without this, a second seeded call would continue the ID
            # sequences where the first call stopped.
            self._reset_counters()

        return [self._generate_row() for _ in range(self.rows)]

    def _generate_row(self) -> Dict[str, Any]:
        """
        Generate a single insurance data row.

        Returns:
            Dictionary with the 20 columns described by get_schema()
        """
        # Read the clock once so every date bound in this row is consistent
        now = datetime.now()

        # Generate policy information
        policy_type = random.choice(self.policy_types)
        payment_frequency = random.choice(self.payment_frequencies)

        # Generate policy dates
        start_date = self.faker_utils.date_between(
            now - timedelta(days=1095),  # 3 years ago
            now - timedelta(days=30),    # 30 days ago
        )

        # Policy duration varies by type; a month is approximated as 30 days
        duration_months = self._get_policy_duration(policy_type)
        end_date = start_date + timedelta(days=duration_months * 30)

        # Generate coverage and premium amounts
        coverage_amount = self._generate_coverage_amount(policy_type)
        premium_amount = self._generate_premium_amount(
            policy_type, coverage_amount
        )

        # Generate claim information (not all policies have claims)
        has_claim = random.random() < 0.3  # 30% of policies have claims

        claim_id = None
        claim_date = None
        claim_amount = None
        claim_reason = None
        claim_status = None

        if has_claim:
            claim_id = self._generate_claim_id()
            # A claim falls inside the policy term and never in the future.
            # NOTE(review): comparing end_date with a date assumes
            # date_between returns datetime.date objects - confirm.
            claim_date = self.faker_utils.date_between(
                start_date, min(end_date, now.date())
            )
            claim_amount = self._generate_claim_amount(
                policy_type, coverage_amount
            )
            claim_reason = random.choice(
                self.claim_reasons.get(policy_type, ['General Claim'])
            )
            claim_status = random.choice(self.claim_statuses)

        # Generate risk score
        risk_score = self._generate_risk_score(policy_type, has_claim)

        # Generate agent information; the ID is tied to the name so the same
        # agent is never listed under two different IDs
        agent_name = random.choice(self.agent_names)
        agent_id = self._agent_id_for(agent_name)

        # Generate location information
        region = random.choice(self.regions)
        country = self.faker_utils.country()
        city = self.faker_utils.city()

        # Generate notes (occasionally)
        notes = None
        if random.random() < 0.2:
            notes = random.choice(self.note_options)

        return {
            'policy_id': self._generate_policy_id(),
            'customer_id': self._generate_customer_id(),
            'policy_type': policy_type,
            'coverage_amount': coverage_amount,
            'premium_amount': round(premium_amount, 2),
            'payment_frequency': payment_frequency,
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d'),
            'claim_id': claim_id,
            'claim_date': (
                claim_date.strftime('%Y-%m-%d') if claim_date else None
            ),
            'claim_amount': (
                round(claim_amount, 2) if claim_amount is not None else None
            ),
            'claim_reason': claim_reason,
            'claim_status': claim_status,
            'risk_score': round(risk_score, 3),
            'agent_id': agent_id,
            'agent_name': agent_name,
            'region': region,
            'country': country,
            'city': city,
            'notes': notes,
        }

    def _generate_policy_id(self) -> str:
        """Generate policy ID in format POL-NNNNNN."""
        policy_id = f"POL-{self._policy_counter:06d}"
        self._policy_counter += 1
        return policy_id

    def _generate_customer_id(self) -> str:
        """Generate customer ID in format CUST-NNNNNN."""
        customer_id = f"CUST-{self._customer_counter:06d}"
        self._customer_counter += 1
        return customer_id

    def _generate_claim_id(self) -> str:
        """Generate claim ID in format CLM-NNNNNN."""
        claim_id = f"CLM-{self._claim_counter:06d}"
        self._claim_counter += 1
        return claim_id

    def _generate_agent_id(self) -> str:
        """Generate the next sequential agent ID in format AGT-NNN."""
        agent_id = f"AGT-{self._agent_counter:03d}"
        self._agent_counter += 1
        return agent_id

    def _agent_id_for(self, agent_name: str) -> str:
        """
        Return the agent ID belonging to an agent name.

        The first time a name is seen it receives the next sequential ID;
        afterwards the same ID is returned for that name.

        Args:
            agent_name: Name of the agent

        Returns:
            Agent ID in format AGT-NNN
        """
        agent_id = self._agent_ids.get(agent_name)
        if agent_id is None:
            agent_id = self._generate_agent_id()
            self._agent_ids[agent_name] = agent_id
        return agent_id

    def _get_policy_duration(self, policy_type: str) -> int:
        """
        Get typical policy duration in months.

        Args:
            policy_type: Type of insurance policy

        Returns:
            Duration in months (12 for unknown policy types)
        """
        durations = {
            'Auto': 12,
            'Health': 12,
            'Property': 12,
            'Life': 240,  # 20 years
            'Business': 12,
            'Travel': 1,
            'Disability': 24,
        }
        return durations.get(policy_type, 12)

    def _generate_coverage_amount(self, policy_type: str) -> float:
        """
        Generate realistic coverage amount based on policy type.

        Args:
            policy_type: Type of insurance policy

        Returns:
            Coverage amount rounded to 2 decimals
        """
        coverage_ranges = {
            'Auto': (25000, 500000),
            'Health': (50000, 2000000),
            'Property': (100000, 1000000),
            'Life': (50000, 5000000),
            'Business': (100000, 10000000),
            'Travel': (1000, 100000),
            'Disability': (25000, 500000),
        }
        min_coverage, max_coverage = coverage_ranges.get(
            policy_type, (10000, 100000)
        )
        return round(random.uniform(min_coverage, max_coverage), 2)

    def _generate_premium_amount(
        self, policy_type: str, coverage_amount: float
    ) -> float:
        """
        Generate realistic premium amount based on policy type and coverage.

        Args:
            policy_type: Type of insurance policy
            coverage_amount: Coverage amount

        Returns:
            Premium amount (unrounded)
        """
        # Premium as percentage of coverage
        premium_rates = {
            'Auto': (0.02, 0.08),       # 2-8% of coverage
            'Health': (0.05, 0.15),     # 5-15% of coverage
            'Property': (0.005, 0.02),  # 0.5-2% of coverage
            'Life': (0.01, 0.05),       # 1-5% of coverage
            'Business': (0.01, 0.08),   # 1-8% of coverage
            'Travel': (0.05, 0.2),      # 5-20% of coverage
            'Disability': (0.02, 0.1),  # 2-10% of coverage
        }
        min_rate, max_rate = premium_rates.get(policy_type, (0.01, 0.05))
        rate = random.uniform(min_rate, max_rate)
        return coverage_amount * rate

    def _generate_claim_amount(
        self, policy_type: str, coverage_amount: float
    ) -> float:
        """
        Generate realistic claim amount.

        Args:
            policy_type: Type of insurance policy (currently not used in the
                calculation)
            coverage_amount: Coverage amount

        Returns:
            Claim amount (unrounded), never below 100.0
        """
        # Claims are typically a fraction of coverage
        max_claim = coverage_amount * 0.8    # Max 80% of coverage
        min_claim = coverage_amount * 0.001  # Min 0.1% of coverage

        claim_amount = random.uniform(min_claim, max_claim)

        # Multiplying by a squared uniform draw skews towards small claims
        claim_amount = claim_amount * (random.random() ** 2)

        # Minimum claim of $100; float literal keeps the return type float
        return max(100.0, claim_amount)

    def _generate_risk_score(self, policy_type: str, has_claim: bool) -> float:
        """
        Generate risk score based on policy type and claim history.

        Args:
            policy_type: Type of insurance policy
            has_claim: Whether policy has claims

        Returns:
            Risk score between 0 and 1
        """
        base_risk = {
            'Auto': 0.3,
            'Health': 0.4,
            'Property': 0.2,
            'Life': 0.1,
            'Business': 0.5,
            'Travel': 0.6,
            'Disability': 0.3,
        }.get(policy_type, 0.3)

        # Adjust risk based on claims
        if has_claim:
            base_risk += 0.2

        # Add random variation
        risk_score = base_risk + random.uniform(-0.2, 0.2)

        # Clamp to [0, 1]; float literals keep the return type float
        return max(0.0, min(1.0, risk_score))

    def get_schema(self) -> Dict[str, str]:
        """
        Return column schema with types.

        Returns:
            Dictionary mapping column names to their data types
        """
        return {
            'policy_id': 'string',
            'customer_id': 'string',
            'policy_type': 'string',
            'coverage_amount': 'float',
            'premium_amount': 'float',
            'payment_frequency': 'string',
            'start_date': 'date',
            'end_date': 'date',
            'claim_id': 'string',
            'claim_date': 'date',
            'claim_amount': 'float',
            'claim_reason': 'string',
            'claim_status': 'string',
            'risk_score': 'float',
            'agent_id': 'string',
            'agent_name': 'string',
            'region': 'string',
            'country': 'string',
            'city': 'string',
            'notes': 'string',
        }