Source code for tempdataset.core.datasets.lab_results

"""
Lab results dataset generator.

Generates realistic laboratory test results.
"""

import random
from datetime import datetime, timedelta
from typing import List, Dict, Any, Tuple

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class LabResultsDataset(BaseDataset):
    """Lab results dataset generator for medical testing.

    Every generated row describes a single analyte (for example Hemoglobin)
    picked from one of several predefined panels, together with a synthetic
    result value, a flag describing how the value relates to the reference
    range, and identifiers for the patient and staff involved.
    """

    # Result values are rounded to at least this many decimal places; more
    # are used when the reference bounds themselves carry more precision.
    _MIN_RESULT_DECIMALS = 2

    # Analytes reported as whole counts rather than decimals.
    _INTEGER_TEST_CODES = ('WBC', 'RBC', 'PLT')

    def __init__(self, rows: int = 500):
        """Create a generator that will produce ``rows`` lab results.

        Args:
            rows: Number of rows returned by :meth:`generate`.
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()
        self._init_data_lists()
        # Sequential part of ``lab_result_id``; it keeps counting across
        # repeated ``generate()`` calls on the same instance.
        self._lab_counter = 1

    def _init_data_lists(self) -> None:
        """Populate the static lookup tables used while generating rows."""
        # Common lab tests with their codes, units, and reference ranges.
        # Each test is (name, code, unit, (reference_min, reference_max)).
        self.lab_tests = {
            'CBC': {
                'code': 'CBC001',
                'tests': [
                    ('White Blood Cell Count', 'WBC', 'cells/μL', (4500, 11000)),
                    ('Red Blood Cell Count', 'RBC', 'cells/μL', (4200000, 5400000)),
                    ('Hemoglobin', 'HGB', 'g/dL', (12.0, 16.0)),
                    ('Hematocrit', 'HCT', '%', (36, 46)),
                    ('Platelet Count', 'PLT', 'cells/μL', (150000, 450000)),
                ],
            },
            'Lipid Panel': {
                'code': 'LIPID001',
                'tests': [
                    ('Total Cholesterol', 'CHOL', 'mg/dL', (125, 200)),
                    ('HDL Cholesterol', 'HDL', 'mg/dL', (40, 60)),
                    ('LDL Cholesterol', 'LDL', 'mg/dL', (70, 130)),
                    ('Triglycerides', 'TRIG', 'mg/dL', (35, 150)),
                ],
            },
            'Basic Metabolic Panel': {
                'code': 'BMP001',
                'tests': [
                    ('Glucose', 'GLU', 'mg/dL', (70, 100)),
                    ('Sodium', 'NA', 'mEq/L', (136, 145)),
                    ('Potassium', 'K', 'mEq/L', (3.5, 5.1)),
                    ('Chloride', 'CL', 'mEq/L', (98, 107)),
                    ('BUN', 'BUN', 'mg/dL', (7, 20)),
                    ('Creatinine', 'CREAT', 'mg/dL', (0.6, 1.3)),
                ],
            },
            'Liver Function Tests': {
                'code': 'LFT001',
                'tests': [
                    ('ALT', 'ALT', 'U/L', (7, 40)),
                    ('AST', 'AST', 'U/L', (10, 40)),
                    ('Bilirubin Total', 'TBIL', 'mg/dL', (0.3, 1.2)),
                    ('Alkaline Phosphatase', 'ALP', 'U/L', (44, 147)),
                ],
            },
            'Thyroid Function': {
                'code': 'THYROID001',
                'tests': [
                    ('TSH', 'TSH', 'mIU/L', (0.4, 4.0)),
                    ('T4 Free', 'T4F', 'ng/dL', (0.8, 1.8)),
                    ('T3 Total', 'T3T', 'ng/dL', (80, 200)),
                ],
            },
            'Urinalysis': {
                'code': 'UA001',
                'tests': [
                    ('Specific Gravity', 'SPGR', '', (1.003, 1.030)),
                    ('Protein', 'PROT', 'mg/dL', (0, 20)),
                    ('Glucose', 'UGLU', 'mg/dL', (0, 15)),
                    ('Ketones', 'KET', 'mg/dL', (0, 5)),
                ],
            },
        }

        self.flags = ['Normal', 'High', 'Low', 'Critical']

    def generate(self) -> List[Dict[str, Any]]:
        """Generate the configured number of lab result rows.

        When ``self.seed`` is not ``None`` it is used to seed both the
        ``random`` module and the faker utilities before any row is built.
        Collection dates are anchored to the current time, so dates still
        differ between runs even with a fixed seed.

        Returns:
            A list of ``self.rows`` dictionaries keyed as described by
            :meth:`get_schema`.
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)

        return [self._generate_row() for _ in range(self.rows)]

    @staticmethod
    def _decimal_places(value: float) -> int:
        """Return how many decimal digits ``value`` is written with.

        Integers have zero. Floats are inspected through ``repr``; values
        rendered in exponent notation are treated as having zero decimals.
        """
        if isinstance(value, int):
            return 0
        _, _, fraction = repr(value).partition('.')
        return len(fraction) if fraction.isdigit() else 0

    @classmethod
    def _format_reference_range(cls, min_ref: float, max_ref: float) -> str:
        """Render a reference range as ``"<min>-<max>"``.

        Integer bounds are printed as-is. Float bounds are printed with the
        precision they were defined with (at least one decimal), so a range
        such as 1.003-1.030 is not collapsed to ``"1.0-1.0"``.
        """
        if isinstance(min_ref, int) and isinstance(max_ref, int):
            return f"{min_ref}-{max_ref}"
        decimals = max(
            1, cls._decimal_places(min_ref), cls._decimal_places(max_ref)
        )
        return f"{min_ref:.{decimals}f}-{max_ref:.{decimals}f}"

    def _generate_result(self, ref_range: Tuple[float, float]) -> Tuple[float, str]:
        """Draw a result value and its flag for the given reference range.

        Args:
            ref_range: ``(reference_min, reference_max)`` for the analyte.

        Returns:
            ``(value, flag)`` where ``flag`` is one of ``'Normal'``,
            ``'High'``, ``'Low'`` or ``'Critical'``.
        """
        min_ref, max_ref = ref_range
        # A lower bound of 0 leaves no room for a value below the range:
        # scaling 0 always yields 0, which is still inside the range.
        can_be_low = min_ref > 0
        decimals = max(
            self._MIN_RESULT_DECIMALS,
            self._decimal_places(min_ref),
            self._decimal_places(max_ref),
        )

        # Most results are normal.
        result_type = random.choices(
            ['normal', 'high', 'low', 'critical'],
            weights=[0.70, 0.15, 0.13, 0.02],
        )[0]
        if result_type == 'low' and not can_be_low:
            result_type = 'normal'

        if result_type == 'normal':
            return round(random.uniform(min_ref, max_ref), decimals), 'Normal'

        if result_type == 'high':
            # High values are 10-50% above the upper bound.
            return round(max_ref * random.uniform(1.1, 1.5), decimals), 'High'

        if result_type == 'low':
            # Low values are 10-50% below the lower bound.
            return round(min_ref * random.uniform(0.5, 0.9), decimals), 'Low'

        # Critical values are significantly outside the normal range. The
        # coin flip is always drawn so the random stream does not depend on
        # which analyte was selected.
        critically_high = random.choice([True, False])
        if critically_high or not can_be_low:
            value = max_ref * random.uniform(2.0, 4.0)
        else:
            value = min_ref * random.uniform(0.1, 0.4)
        return round(value, decimals), 'Critical'

    def _generate_row(self) -> Dict[str, Any]:
        """Build one lab result row.

        Returns:
            A dictionary with the keys listed in :meth:`get_schema`.
            ``notes`` is ``None`` for most rows.
        """
        # Basic lab result info
        lab_result_id = f"LAB-2025-{self._lab_counter:06d}"
        self._lab_counter += 1

        patient_id = f"PAT-2025-{random.randint(1, 999999):06d}"
        ordering_physician_id = f"PHY-{random.randint(1000, 9999)}"
        lab_technician_id = f"TECH-{random.randint(100, 999)}"

        # Select a random panel, then one specific test from that panel.
        test_panel = random.choice(list(self.lab_tests.keys()))
        panel_info = self.lab_tests[test_panel]
        test_name, test_code, unit, ref_range = random.choice(panel_info['tests'])

        # Collection happened at some point in the last 30 days.
        # NOTE(review): assumes date_between returns a date-like object
        # accepted by datetime.combine - confirm against faker_utils.
        collection_date = self.faker_utils.date_between(
            datetime.now() - timedelta(days=30),
            datetime.now()
        )

        # Collection typically happens during the day, on a quarter hour.
        collection_hour = random.randint(6, 16)
        collection_minute = random.choice([0, 15, 30, 45])
        collection_datetime = datetime.combine(
            collection_date, datetime.min.time()
        ).replace(hour=collection_hour, minute=collection_minute, second=0)

        # Results are reported between 4 hours and 3 days after collection.
        result_delay_hours = random.randint(4, 72)
        result_datetime = collection_datetime + timedelta(hours=result_delay_hours)

        result_value, flag = self._generate_result(ref_range)

        # Cell counts are reported as whole numbers.
        if test_code in self._INTEGER_TEST_CODES:
            result_value = int(result_value)

        min_ref, max_ref = ref_range
        reference_range = self._format_reference_range(min_ref, max_ref)

        # Notes are optional: roughly 30% of rows draw from this list, which
        # itself contains None.
        # NOTE(review): the note text is chosen independently of ``flag``,
        # so 'Critical value called to physician' can appear on rows that
        # are not flagged Critical.
        notes_options = [
            'Sample hemolyzed',
            'Fasting specimen',
            'Non-fasting specimen',
            'Sample collected properly',
            'Repeated due to interference',
            'Critical value called to physician',
            None
        ]
        notes = random.choice(notes_options) if random.random() < 0.3 else None

        return {
            'lab_result_id': lab_result_id,
            'patient_id': patient_id,
            'test_name': test_name,
            'test_code': test_code,
            'collection_date': collection_datetime.strftime('%Y-%m-%d %H:%M:%S'),
            'result_date': result_datetime.strftime('%Y-%m-%d %H:%M:%S'),
            # Stored as a string so int and float results share one column type.
            'result_value': str(result_value),
            'unit': unit,
            'reference_range': reference_range,
            'flag': flag,
            'ordering_physician_id': ordering_physician_id,
            'lab_technician_id': lab_technician_id,
            'notes': notes
        }

    def get_schema(self) -> Dict[str, str]:
        """Return the column-name to type-name mapping for generated rows."""
        return {
            'lab_result_id': 'string',
            'patient_id': 'string',
            'test_name': 'string',
            'test_code': 'string',
            'collection_date': 'datetime',
            'result_date': 'datetime',
            'result_value': 'string',
            'unit': 'string',
            'reference_range': 'string',
            'flag': 'string',
            'ordering_physician_id': 'string',
            'lab_technician_id': 'string',
            'notes': 'string'
        }