Source code for tempdataset.core.datasets.employees

"""
Employees dataset generator.

Generates realistic employee data with comprehensive HR information including
personal details, employment information, compensation, and performance metrics.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils



[docs]
class EmployeesDataset(BaseDataset):
    """
    Employees dataset generator that creates realistic employee HR data.
    
    Generates comprehensive employee data including:
    - Personal information (names, demographics, contact details)
    - Employment details (hire dates, departments, job titles)
    - Compensation (salary, bonus, total compensation)
    - Performance metrics (scores, reviews, training)
    - Work arrangements (location, status, projects)
    - Management hierarchy (manager relationships)
    """
    
    def __init__(self, rows: int = 500):
        """
        Initialize the EmployeesDataset generator.
        
        Args:
            rows: Number of rows to generate (default: 500)
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()
        
        # Initialize data for consistent generation
        self._init_data_lists()
        
        # Counter for sequential employee IDs
        self._employee_counter = 1
        
        # Store generated employees for manager relationships
        self._generated_employees = []
    
    def _init_data_lists(self) -> None:
        """Initialize predefined data lists for realistic generation."""
        
        # Departments and their typical job titles
        self.departments = {
            'HR': ['HR Manager', 'HR Specialist', 'Recruiter', 'HR Coordinator', 'Benefits Administrator'],
            'Sales': ['Sales Manager', 'Sales Representative', 'Account Executive', 'Sales Coordinator', 'Business Development Manager'],
            'Marketing': ['Marketing Manager', 'Marketing Specialist', 'Content Creator', 'Digital Marketing Manager', 'Brand Manager'],
            'IT': ['Software Engineer', 'DevOps Engineer', 'Data Analyst', 'IT Manager', 'System Administrator', 'QA Engineer'],
            'Finance': ['Financial Analyst', 'Accountant', 'Finance Manager', 'Controller', 'Budget Analyst'],
            'Operations': ['Operations Manager', 'Operations Coordinator', 'Process Analyst', 'Supply Chain Manager', 'Logistics Coordinator'],
            'R&D': ['Research Scientist', 'Product Manager', 'R&D Engineer', 'Innovation Manager', 'Technical Lead'],
            'Customer Support': ['Customer Support Representative', 'Support Manager', 'Technical Support Specialist', 'Customer Success Manager']
        }
        
        # Employment types
        self.employment_types = ['Full-time', 'Part-time', 'Contract', 'Internship']
        
        # Gender options
        self.genders = ['Male', 'Female', 'Non-binary', 'Prefer not to say']
        
        # Work locations
        self.work_locations = ['Onsite', 'Remote', 'Hybrid']
        
        # Office locations (cities)
        self.office_locations = [
            'New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia',
            'San Antonio', 'San Diego', 'Dallas', 'San Jose', 'Austin', 'Jacksonville',
            'Seattle', 'Denver', 'Boston', 'Nashville', 'Detroit', 'Portland',
            'Las Vegas', 'Memphis', 'Louisville', 'Baltimore', 'Milwaukee', 'Atlanta'
        ]
        
        # Employee statuses
        self.employee_statuses = ['Active', 'On Leave', 'Terminated', 'Retired']
        
        # Common skills by department
        self.skills_by_department = {
            'HR': ['Recruitment', 'Employee Relations', 'Performance Management', 'Benefits Administration', 'Training & Development'],
            'Sales': ['Lead Generation', 'CRM', 'Negotiation', 'Customer Relationship Management', 'Sales Forecasting'],
            'Marketing': ['Digital Marketing', 'Content Creation', 'SEO', 'Social Media', 'Brand Management', 'Analytics'],
            'IT': ['Python', 'Java', 'JavaScript', 'SQL', 'Cloud Computing', 'DevOps', 'Data Analysis', 'Cybersecurity'],
            'Finance': ['Financial Analysis', 'Budgeting', 'Forecasting', 'Excel', 'Financial Reporting', 'Risk Management'],
            'Operations': ['Process Improvement', 'Project Management', 'Supply Chain', 'Logistics', 'Quality Assurance'],
            'R&D': ['Research', 'Product Development', 'Innovation', 'Technical Writing', 'Data Analysis', 'Project Management'],
            'Customer Support': ['Customer Service', 'Technical Support', 'Problem Solving', 'Communication', 'CRM']
        }
        
        # Common certifications by department
        self.certifications_by_department = {
            'HR': ['PHR', 'SHRM-CP', 'SHRM-SCP', 'HRCI'],
            'Sales': ['Salesforce Certified', 'HubSpot Certified', 'Google Analytics'],
            'Marketing': ['Google Ads Certified', 'HubSpot Marketing', 'Facebook Blueprint', 'Google Analytics'],
            'IT': ['AWS Solutions Architect', 'Microsoft Azure', 'Cisco CCNA', 'CompTIA Security+', 'PMP'],
            'Finance': ['CPA', 'CFA', 'FRM', 'CMA'],
            'Operations': ['PMP', 'Six Sigma', 'Lean Manufacturing', 'APICS'],
            'R&D': ['PMP', 'Agile Certified', 'Six Sigma'],
            'Customer Support': ['ITIL', 'Customer Service Certification', 'Technical Support Certification']
        }
        
        # Project names
        self.project_names = [
            'Digital Transformation Initiative', 'Customer Experience Enhancement', 'Product Launch 2024',
            'Process Optimization', 'Market Expansion', 'Technology Upgrade', 'Quality Improvement',
            'Cost Reduction Program', 'Innovation Lab', 'Sustainability Initiative', 'Data Analytics Platform',
            'Mobile App Development', 'Cloud Migration', 'Security Enhancement', 'Training Program'
        ]
    

[docs]
    def generate(self) -> List[Dict[str, Any]]:
        """
        Generate employees dataset rows.
        
        Returns:
            List of dictionaries representing employee records
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
        
        data = []
        
        # First pass: generate all employees without manager relationships
        for i in range(self.rows):
            row = self._generate_row()
            data.append(row)
            self._generated_employees.append({
                'employee_id': row['employee_id'],
                'full_name': row['full_name'],
                'department': row['department'],
                'job_title': row['job_title']
            })
        
        # Second pass: assign managers (about 80% of employees have managers)
        self._assign_managers(data)
        
        return data

    
    def _generate_row(self) -> Dict[str, Any]:
        """Generate a single employee record."""
        
        # Generate personal information
        first_name = self.faker_utils.first_name()
        last_name = self.faker_utils.last_name()
        full_name = f"{first_name} {last_name}"
        
        # Generate birth date (employees aged 22-65)
        today = datetime.now()
        min_birth_date = today - timedelta(days=65*365)
        max_birth_date = today - timedelta(days=22*365)
        date_of_birth = self.faker_utils.date_between(min_birth_date, max_birth_date)
        
        # Convert date_of_birth to datetime for calculation if it's a date object
        if hasattr(date_of_birth, 'date'):
            # It's already a datetime
            birth_datetime = date_of_birth
        else:
            # It's a date, convert to datetime
            birth_datetime = datetime.combine(date_of_birth, datetime.min.time())
        
        age = int((today - birth_datetime).days / 365.25)
        
        # Generate contact information
        email = self.faker_utils.email(full_name)
        phone_number = self.faker_utils.phone_number()
        address = self.faker_utils.address().replace('\n', ', ')
        city = self.faker_utils.city()
        state_province = self.faker_utils.state()
        country = self.faker_utils.country()
        postal_code = self.faker_utils.postal_code()
        
        # Generate employment information
        department = random.choice(list(self.departments.keys()))
        job_title = random.choice(self.departments[department])
        employment_type = random.choice(self.employment_types)
        
        # Generate hire date (within last 10 years, but not future)
        max_hire_date = min(today, today - timedelta(days=30))  # At least 30 days ago
        min_hire_date = today - timedelta(days=10*365)
        hire_date = self.faker_utils.date_between(min_hire_date, max_hire_date)
        
        # Generate termination date (20% chance of being terminated)
        termination_date = None
        employee_status = 'Active'
        if random.random() < 0.20:
            # Terminated employees
            # Convert hire_date to datetime for calculation if it's a date object
            if hasattr(hire_date, 'date'):
                hire_datetime_for_term = hire_date
            else:
                hire_datetime_for_term = datetime.combine(hire_date, datetime.min.time())
            
            min_term_date = hire_datetime_for_term + timedelta(days=30)  # At least 30 days after hire
            max_term_date = today
            if min_term_date < max_term_date:
                termination_date = self.faker_utils.date_between(min_term_date, max_term_date)
                employee_status = random.choice(['Terminated', 'Retired'])
        elif random.random() < 0.05:
            # 5% on leave
            employee_status = 'On Leave'
        
        # Calculate years with company
        end_date = termination_date if termination_date else today
        
        # Convert hire_date to datetime for calculation if it's a date object
        if hasattr(hire_date, 'date'):
            # It's already a datetime
            hire_datetime = hire_date
        else:
            # It's a date, convert to datetime
            hire_datetime = datetime.combine(hire_date, datetime.min.time())
        
        # Convert end_date to datetime for calculation if it's a date object
        if hasattr(end_date, 'date'):
            # It's already a datetime
            end_datetime = end_date
        else:
            # It's a date, convert to datetime
            end_datetime = datetime.combine(end_date, datetime.min.time())
        
        years_with_company = round((end_datetime - hire_datetime).days / 365.25, 1)
        
        # Generate compensation based on department and seniority
        salary_usd = self._generate_salary(department, job_title, years_with_company)
        bonus_usd = round(salary_usd * random.uniform(0.05, 0.20), 2)  # 5-20% bonus
        total_compensation_usd = salary_usd + bonus_usd
        
        # Generate performance metrics
        performance_score = random.randint(1, 5)
        
        # Generate last performance review date
        if employee_status == 'Active':
            # Convert hire_date to datetime for comparison
            if hasattr(hire_date, 'date'):
                hire_datetime_for_review = hire_date
            else:
                hire_datetime_for_review = datetime.combine(hire_date, datetime.min.time())
            
            review_start = max(hire_datetime_for_review, today - timedelta(days=365))
            last_performance_review_date = self.faker_utils.date_between(review_start, today)
        else:
            # For terminated employees, review before termination
            review_end = termination_date if termination_date else today
            
            # Convert dates to datetime for comparison
            if hasattr(hire_date, 'date'):
                hire_datetime_for_review = hire_date
            else:
                hire_datetime_for_review = datetime.combine(hire_date, datetime.min.time())
            
            if hasattr(review_end, 'date'):
                review_end_datetime = review_end
            else:
                review_end_datetime = datetime.combine(review_end, datetime.min.time())
            
            review_start = max(hire_datetime_for_review, review_end_datetime - timedelta(days=365))
            if review_start < review_end_datetime:
                last_performance_review_date = self.faker_utils.date_between(review_start, review_end_datetime)
            else:
                last_performance_review_date = hire_date
        
        # Generate training and skills
        training_hours = round(random.uniform(0, 80), 1)  # 0-80 hours per year
        skills = self._generate_skills(department)
        certifications = self._generate_certifications(department)
        
        # Generate project information
        projects_count = random.randint(0, 8)
        current_project = random.choice(self.project_names) if employee_status == 'Active' and random.random() < 0.7 else None
        
        # Generate work arrangement
        leave_balance_days = round(random.uniform(0, 25), 1)  # 0-25 days
        work_location = random.choice(self.work_locations)
        office_location = random.choice(self.office_locations)
        
        return {
            'employee_id': self._generate_employee_id(),
            'first_name': first_name,
            'last_name': last_name,
            'full_name': full_name,
            'gender': random.choice(self.genders),
            'date_of_birth': date_of_birth.strftime('%Y-%m-%d'),
            'age': age,
            'email': email,
            'phone_number': phone_number,
            'address': address,
            'city': city,
            'state_province': state_province,
            'country': country,
            'postal_code': postal_code,
            'department': department,
            'job_title': job_title,
            'employment_type': employment_type,
            'hire_date': hire_date.strftime('%Y-%m-%d'),
            'termination_date': termination_date.strftime('%Y-%m-%d') if termination_date else None,
            'years_with_company': years_with_company,
            'manager_id': None,  # Will be assigned in second pass
            'manager_name': None,  # Will be assigned in second pass
            'salary_usd': salary_usd,
            'bonus_usd': bonus_usd,
            'total_compensation_usd': total_compensation_usd,
            'performance_score': performance_score,
            'last_performance_review_date': last_performance_review_date.strftime('%Y-%m-%d'),
            'training_hours': training_hours,
            'skills': skills,
            'certifications': certifications,
            'projects_count': projects_count,
            'current_project': current_project,
            'leave_balance_days': leave_balance_days,
            'work_location': work_location,
            'office_location': office_location,
            'employee_status': employee_status
        }
    
    def _generate_employee_id(self) -> str:
        """
        Generate employee ID in format "EMP-NNNNNN".
        
        Returns:
            Formatted employee ID
        """
        employee_num = str(self._employee_counter).zfill(6)
        self._employee_counter += 1
        return f"EMP-{employee_num}"
    
    def _generate_salary(self, department: str, job_title: str, years_with_company: float) -> float:
        """
        Generate realistic salary based on department, role, and experience.
        
        Args:
            department: Employee's department
            job_title: Employee's job title
            years_with_company: Years of experience
            
        Returns:
            Annual salary in USD
        """
        # Base salary ranges by department
        base_ranges = {
            'HR': (45000, 85000),
            'Sales': (40000, 120000),
            'Marketing': (45000, 95000),
            'IT': (60000, 140000),
            'Finance': (50000, 110000),
            'Operations': (45000, 90000),
            'R&D': (65000, 130000),
            'Customer Support': (35000, 70000)
        }
        
        # Manager roles get higher salaries
        is_manager = 'Manager' in job_title or 'Lead' in job_title
        
        min_salary, max_salary = base_ranges.get(department, (40000, 80000))
        
        if is_manager:
            min_salary = int(min_salary * 1.3)
            max_salary = int(max_salary * 1.5)
        
        # Base salary
        base_salary = random.uniform(min_salary, max_salary)
        
        # Experience multiplier (up to 50% increase for 10+ years)
        experience_multiplier = 1 + min(years_with_company * 0.05, 0.5)
        
        final_salary = base_salary * experience_multiplier
        
        return round(final_salary, 2)
    
    def _generate_skills(self, department: str) -> str:
        """
        Generate comma-separated skills based on department.
        
        Args:
            department: Employee's department
            
        Returns:
            Comma-separated string of skills
        """
        dept_skills = self.skills_by_department.get(department, ['Communication', 'Teamwork', 'Problem Solving'])
        
        # Select 3-6 skills
        num_skills = random.randint(3, 6)
        selected_skills = random.sample(dept_skills, min(num_skills, len(dept_skills)))
        
        # Add some general skills
        general_skills = ['Communication', 'Teamwork', 'Problem Solving', 'Time Management', 'Leadership']
        if random.random() < 0.3:  # 30% chance to add a general skill
            general_skill = random.choice(general_skills)
            if general_skill not in selected_skills:
                selected_skills.append(general_skill)
        
        return ', '.join(selected_skills)
    
    def _generate_certifications(self, department: str) -> Optional[str]:
        """
        Generate certifications based on department (60% chance of having certifications).
        
        Args:
            department: Employee's department
            
        Returns:
            Comma-separated string of certifications or None
        """
        if random.random() > 0.6:  # 40% chance of no certifications
            return None
        
        dept_certs = self.certifications_by_department.get(department, [])
        if not dept_certs:
            return None
        
        # Select 1-3 certifications
        num_certs = random.randint(1, min(3, len(dept_certs)))
        selected_certs = random.sample(dept_certs, num_certs)
        
        return ', '.join(selected_certs)
    
    def _assign_managers(self, data: List[Dict[str, Any]]) -> None:
        """
        Assign managers to employees in a second pass.
        
        Args:
            data: List of employee records to update with manager information
        """
        # Group employees by department
        dept_employees = {}
        for i, employee in enumerate(data):
            dept = employee['department']
            if dept not in dept_employees:
                dept_employees[dept] = []
            dept_employees[dept].append((i, employee))
        
        # For each department, assign managers
        for dept, employees in dept_employees.items():
            if len(employees) <= 1:
                continue  # Skip departments with only one employee
            
            # Find potential managers (those with manager titles or senior roles)
            managers = []
            non_managers = []
            
            for idx, emp in employees:
                if ('Manager' in emp['job_title'] or 'Lead' in emp['job_title'] or 
                    emp['years_with_company'] >= 3):
                    managers.append((idx, emp))
                else:
                    non_managers.append((idx, emp))
            
            # If no managers found, promote the most senior employee
            if not managers and employees:
                # Sort by years with company, descending
                sorted_employees = sorted(employees, key=lambda x: x[1]['years_with_company'], reverse=True)
                managers = [sorted_employees[0]]
                non_managers = sorted_employees[1:]
            
            # Assign managers to non-managers (80% chance)
            for idx, emp in non_managers:
                if random.random() < 0.8 and managers:  # 80% chance of having a manager
                    manager_idx, manager = random.choice(managers)
                    data[idx]['manager_id'] = manager['employee_id']
                    data[idx]['manager_name'] = manager['full_name']
    

[docs]
    def get_schema(self) -> Dict[str, str]:
        """
        Return column schema with types.
        
        Returns:
            Dictionary mapping column names to their data types
        """
        return {
            'employee_id': 'string',
            'first_name': 'string',
            'last_name': 'string',
            'full_name': 'string',
            'gender': 'string',
            'date_of_birth': 'date',
            'age': 'integer',
            'email': 'string',
            'phone_number': 'string',
            'address': 'string',
            'city': 'string',
            'state_province': 'string',
            'country': 'string',
            'postal_code': 'string',
            'department': 'string',
            'job_title': 'string',
            'employment_type': 'string',
            'hire_date': 'date',
            'termination_date': 'date',
            'years_with_company': 'float',
            'manager_id': 'string',
            'manager_name': 'string',
            'salary_usd': 'float',
            'bonus_usd': 'float',
            'total_compensation_usd': 'float',
            'performance_score': 'integer',
            'last_performance_review_date': 'date',
            'training_hours': 'float',
            'skills': 'string',
            'certifications': 'string',
            'projects_count': 'integer',
            'current_project': 'string',
            'leave_balance_days': 'float',
            'work_location': 'string',
            'office_location': 'string',
            'employee_status': 'string'
        }