Source code for tempdataset.core.datasets.crm

"""
CRM dataset generator.

Generates realistic customer relationship management data with 30 columns covering
customer details, account management, interactions, sales pipeline, loyalty, support, and geographic data.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class CrmDataset(BaseDataset):
    """
    CRM dataset generator that creates realistic customer relationship management data.

    Generates 30 columns of CRM data including:
    - Customer information (customer_id, name, email, phone, company, industry)
    - Account management (account_manager, creation_date, status)
    - Interactions (channel, notes, contact dates)
    - Sales pipeline (stage, deal value, probability)
    - Loyalty and purchase data (status, total orders, total spent)
    - Support data (tickets, satisfaction ratings)
    - Geographic and preference data
    """

    # Look-back window for account creation dates (3 years).
    _ACCOUNT_HISTORY_DAYS = 1095

    # Lookup tables shared by all instances; built once instead of on every
    # row. Each maps a category to an inclusive (min, max) range.
    _DEAL_VALUE_RANGES = {
        'Lead': (1000, 25000),
        'Prospect': (5000, 50000),
        'Negotiation': (10000, 100000),
        'Closed Won': (15000, 150000),
        'Closed Lost': (2000, 30000),
    }
    _DEAL_PROBABILITY_RANGES = {
        'Lead': (5, 25),
        'Prospect': (25, 50),
        'Negotiation': (50, 85),
        'Closed Won': (100, 100),
        'Closed Lost': (0, 0),
    }
    _ORDER_COUNT_RANGES = {
        'Bronze': (1, 5),
        'Silver': (6, 15),
        'Gold': (16, 35),
        'Platinum': (36, 100),
    }
    _AVG_ORDER_VALUE_RANGES = {
        'Bronze': (50, 200),
        'Silver': (200, 500),
        'Gold': (500, 1500),
        'Platinum': (1500, 5000),
    }

    def __init__(self, rows: int = 500):
        """
        Initialize the CrmDataset generator.

        Args:
            rows: Number of rows to generate (default: 500)
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()

        # Initialize data for consistent generation
        self._init_data_lists()

        # Counter for sequential customer IDs
        self._customer_counter = 1

    def _init_data_lists(self) -> None:
        """Initialize predefined data lists for realistic generation."""
        # Industries
        self.industries = [
            'Technology', 'Healthcare', 'Finance', 'Manufacturing', 'Retail',
            'Education', 'Real Estate', 'Automotive', 'Energy', 'Telecommunications',
            'Media', 'Transportation', 'Hospitality', 'Construction', 'Agriculture',
            'Pharmaceuticals', 'Aerospace', 'Food & Beverage', 'Legal', 'Consulting'
        ]

        # Account managers
        self.account_managers = [
            'Sarah Johnson', 'Mike Davis', 'Lisa Wilson', 'David Brown', 'Jennifer Garcia',
            'Robert Miller', 'Amanda Taylor', 'Chris Anderson', 'Michelle White', 'Kevin Lee',
            'Rachel Green', 'Tom Wilson', 'Emily Chen', 'Mark Thompson', 'Jessica Rodriguez'
        ]

        # Interaction channels
        self.interaction_channels = ['Email', 'Phone', 'Meeting', 'Chat', 'Video Call', 'Social Media']

        # Sales stages
        self.sales_stages = ['Lead', 'Prospect', 'Negotiation', 'Closed Won', 'Closed Lost']

        # Loyalty status levels
        self.loyalty_statuses = ['Bronze', 'Silver', 'Gold', 'Platinum']

        # Referral sources
        self.referral_sources = [
            'Online Ad', 'Friend', 'Trade Show', 'Website', 'Social Media',
            'Email Campaign', 'Cold Call', 'Partner Referral', 'Search Engine', 'Direct Mail'
        ]

        # Preferred contact times
        self.contact_times = ['Morning', 'Afternoon', 'Evening']

        # Account statuses
        self.account_statuses = ['Active', 'Inactive', 'Suspended']

        # Regions
        self.regions = ['North America', 'Europe', 'Asia Pacific', 'Latin America', 'Middle East & Africa']

        # Sample interaction notes
        self.interaction_notes_templates = [
            'Discussed product features and pricing options',
            'Follow-up call scheduled for next week',
            'Customer expressed interest in enterprise solution',
            'Provided demo of new features',
            'Addressed technical questions about integration',
            'Negotiating contract terms and conditions',
            'Customer requested additional references',
            'Discussed implementation timeline',
            'Reviewed proposal and next steps',
            'Customer needs approval from management'
        ]

        # Sample notes
        self.notes_templates = [
            'High-value customer with strong growth potential',
            'Requires regular check-ins and support',
            'Decision maker for technology purchases',
            'Price-sensitive but loyal customer',
            'Interested in long-term partnership',
            'Prefers email communication over phone',
            'Has specific compliance requirements',
            'Looking to expand to new markets',
            'Seasonal business with peak in Q4',
            'Strong advocate for our products'
        ]

    def generate(self) -> List[Dict[str, Any]]:
        """
        Generate CRM dataset rows.

        When ``self.seed`` is set (the attribute is not defined in this class;
        presumably provided by ``BaseDataset``), the random generators are
        re-seeded and the customer ID sequence restarts at 1, so repeated
        seeded calls reproduce every column, including ``customer_id``.
        Without a seed the ID counter keeps running across calls.

        Returns:
            List of dictionaries representing CRM customer rows
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
            # Restart the ID sequence too; otherwise a second seeded run would
            # repeat every value except customer_id.
            self._customer_counter = 1

        return [self._generate_row() for _ in range(self.rows)]

    def _generate_row(self) -> Dict[str, Any]:
        """
        Generate a single CRM customer row.

        The order of the random/faker draws below determines seeded output,
        so it must not be rearranged.

        Returns:
            Dictionary with one value per column listed in ``get_schema``
        """
        # Generate customer information
        first_name = self.faker_utils.first_name()
        last_name = self.faker_utils.last_name()
        customer_name = f"{first_name} {last_name}"
        email = self.faker_utils.email(customer_name)
        phone_number = self.faker_utils.phone_number()
        company = self.faker_utils.company()
        industry = random.choice(self.industries)

        # Generate account information
        account_manager = random.choice(self.account_managers)
        account_creation_date = self._generate_account_creation_date()
        last_contact_date = self._generate_last_contact_date(account_creation_date)

        # Generate interaction data
        interaction_channel = random.choice(self.interaction_channels)
        interaction_notes = random.choice(self.interaction_notes_templates)

        # Generate sales pipeline data
        sales_stage = random.choice(self.sales_stages)
        deal_value = self._generate_deal_value(sales_stage)
        deal_probability = self._generate_deal_probability(sales_stage)

        # Generate loyalty and purchase data
        loyalty_status = random.choice(self.loyalty_statuses)
        total_orders = self._generate_total_orders(loyalty_status)
        total_spent = self._generate_total_spent(total_orders, loyalty_status)

        # Generate support data; a satisfaction rating only exists when the
        # customer has opened at least one ticket.
        support_tickets = random.randint(0, 20)
        support_satisfaction = random.randint(1, 5) if support_tickets > 0 else None

        # Generate preferences and settings
        preferred_contact_time = random.choice(self.contact_times)
        newsletter_subscribed = random.choice([True, False])
        referral_source = random.choice(self.referral_sources)

        # Generate geographic data (region and country are drawn independently)
        region = random.choice(self.regions)
        country = self.faker_utils.country()
        state = self.faker_utils.state()
        city = self.faker_utils.city()
        postal_code = self.faker_utils.postal_code()

        # Generate additional data
        notes = random.choice(self.notes_templates)
        account_status = random.choice(self.account_statuses)

        return {
            'customer_id': self._generate_customer_id(),
            'first_name': first_name,
            'last_name': last_name,
            'email': email,
            'phone_number': phone_number,
            'company': company,
            'industry': industry,
            'account_manager': account_manager,
            'account_creation_date': account_creation_date.strftime('%Y-%m-%d'),
            'last_contact_date': last_contact_date.strftime('%Y-%m-%d'),
            'interaction_channel': interaction_channel,
            'interaction_notes': interaction_notes,
            'sales_stage': sales_stage,
            'deal_value': round(deal_value, 2),
            'deal_probability': deal_probability,
            'loyalty_status': loyalty_status,
            'total_orders': total_orders,
            'total_spent': round(total_spent, 2),
            'support_tickets': support_tickets,
            'support_satisfaction': support_satisfaction,
            'preferred_contact_time': preferred_contact_time,
            'newsletter_subscribed': newsletter_subscribed,
            'referral_source': referral_source,
            'region': region,
            'country': country,
            'state': state,
            'city': city,
            'postal_code': postal_code,
            'notes': notes,
            'account_status': account_status
        }

    def _generate_customer_id(self) -> str:
        """
        Generate customer ID in format "CUST-NNNNNN".

        Each call consumes the next value of the instance counter.

        Returns:
            Formatted customer ID
        """
        customer_num = str(self._customer_counter).zfill(6)
        self._customer_counter += 1
        return f"CUST-{customer_num}"

    def _generate_account_creation_date(self) -> datetime:
        """
        Generate account creation date (within last 3 years).

        Returns:
            Account creation datetime
        """
        end_date = datetime.now()
        start_date = end_date - timedelta(days=self._ACCOUNT_HISTORY_DAYS)
        return self.faker_utils.date_between(start_date, end_date)

    def _generate_last_contact_date(self, creation_date: datetime) -> datetime:
        """
        Generate last contact date (between creation date and now).

        Args:
            creation_date: Account creation date

        Returns:
            Last contact datetime
        """
        end_date = datetime.now()
        return self.faker_utils.date_between(creation_date, end_date)

    def _generate_deal_value(self, sales_stage: str) -> float:
        """
        Generate deal value based on sales stage.

        Args:
            sales_stage: Current sales stage; unknown stages fall back to
                the range (5000, 50000)

        Returns:
            Deal value as float
        """
        min_val, max_val = self._DEAL_VALUE_RANGES.get(sales_stage, (5000, 50000))
        return random.uniform(min_val, max_val)

    def _generate_deal_probability(self, sales_stage: str) -> float:
        """
        Generate deal probability based on sales stage.

        Args:
            sales_stage: Current sales stage; unknown stages fall back to
                the range (25, 75)

        Returns:
            Deal probability as percentage (0-100)
        """
        min_prob, max_prob = self._DEAL_PROBABILITY_RANGES.get(sales_stage, (25, 75))
        # Closed stages have a degenerate range, but the draw is still made so
        # the random stream advances by one value for every stage.
        return random.uniform(min_prob, max_prob)

    def _generate_total_orders(self, loyalty_status: str) -> int:
        """
        Generate total orders based on loyalty status.

        Args:
            loyalty_status: Customer loyalty level; unknown levels fall back
                to the range (1, 10)

        Returns:
            Total number of orders
        """
        min_orders, max_orders = self._ORDER_COUNT_RANGES.get(loyalty_status, (1, 10))
        return random.randint(min_orders, max_orders)

    def _generate_total_spent(self, total_orders: int, loyalty_status: str) -> float:
        """
        Generate total spent based on orders and loyalty status.

        Args:
            total_orders: Number of orders
            loyalty_status: Customer loyalty level; unknown levels fall back
                to an average order range of (100, 500)

        Returns:
            Total amount spent
        """
        min_avg, max_avg = self._AVG_ORDER_VALUE_RANGES.get(loyalty_status, (100, 500))
        avg_order_value = random.uniform(min_avg, max_avg)
        return total_orders * avg_order_value * random.uniform(0.8, 1.2)  # Add some variance

    def get_schema(self) -> Dict[str, str]:
        """
        Return column schema with types.

        Note that ``support_satisfaction`` is declared as integer but is
        ``None`` in rows where ``support_tickets`` is 0.

        Returns:
            Dictionary mapping column names to their data types
        """
        return {
            'customer_id': 'string',
            'first_name': 'string',
            'last_name': 'string',
            'email': 'string',
            'phone_number': 'string',
            'company': 'string',
            'industry': 'string',
            'account_manager': 'string',
            'account_creation_date': 'date',
            'last_contact_date': 'date',
            'interaction_channel': 'string',
            'interaction_notes': 'string',
            'sales_stage': 'string',
            'deal_value': 'float',
            'deal_probability': 'float',
            'loyalty_status': 'string',
            'total_orders': 'integer',
            'total_spent': 'float',
            'support_tickets': 'integer',
            'support_satisfaction': 'integer',
            'preferred_contact_time': 'string',
            'newsletter_subscribed': 'boolean',
            'referral_source': 'string',
            'region': 'string',
            'country': 'string',
            'state': 'string',
            'city': 'string',
            'postal_code': 'string',
            'notes': 'string',
            'account_status': 'string'
        }