Source code for tempdataset.core.datasets.clinical_trials

"""
Clinical trials dataset generator.

Generates realistic clinical trial records.
"""

import random
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class ClinicalTrialsDataset(BaseDataset):
    """Clinical trials dataset generator for research studies.

    Each generated row is a dict describing one fictional trial: identifier,
    name, sponsor, start/end dates, phase, status, condition, enrolment size,
    principal investigator id, location, protocol summary and, for completed
    trials only, a results summary and (sometimes) a publication link.
    """

    # (min, max) trial duration in months, keyed by phase. Phases missing
    # from the table fall back to the 'Phase IV' entry.
    _DURATION_MONTHS_BY_PHASE = {
        'Phase I': (6, 18),
        'Phase II': (12, 36),
        'Phase III': (24, 60),
        'Phase IV': (12, 48),
    }

    # (min, max) number of enrolled participants, keyed by phase. Phases
    # missing from the table fall back to the 'Phase IV' entry.
    _PARTICIPANTS_BY_PHASE = {
        'Phase I': (20, 100),
        'Phase II': (100, 300),
        'Phase III': (300, 3000),
        'Phase IV': (500, 5000),
    }

    _FALLBACK_PHASE = 'Phase IV'

    # Share of ongoing (Recruiting/Active) trials that get a planned end date.
    _PLANNED_END_DATE_PROBABILITY = 0.7

    # Share of completed trials that get a publication link.
    _PUBLICATION_PROBABILITY = 0.6

    def __init__(self, rows: int = 500):
        """Create the generator.

        Args:
            rows: Number of trial records produced by :meth:`generate`.
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()
        self._init_data_lists()
        # Sequential suffix used to build the next ``trial_id``.
        self._trial_counter = 1

    def _init_data_lists(self) -> None:
        """Populate the value pools that rows are sampled from."""
        self.trial_phases = ['Phase I', 'Phase II', 'Phase III', 'Phase IV']
        self.trial_statuses = [
            'Recruiting', 'Active', 'Completed', 'Terminated', 'Suspended',
        ]

        # Medical conditions commonly studied in clinical trials
        self.conditions_studied = [
            'Cancer', "Alzheimer's Disease", 'Diabetes',
            'Cardiovascular Disease', 'Depression', 'Rheumatoid Arthritis',
            'Multiple Sclerosis', "Parkinson's Disease", 'Asthma', 'COPD',
            'Hypertension', 'Obesity', 'Chronic Pain', 'HIV/AIDS',
            'Hepatitis C', 'Kidney Disease', 'Osteoporosis', 'Psoriasis',
            'Migraine', 'Sleep Disorders', 'Anxiety Disorders',
            'Bipolar Disorder', 'Schizophrenia',
            'Inflammatory Bowel Disease', 'Lupus', 'Fibromyalgia',
        ]

        # Pharmaceutical companies and research institutions
        self.sponsors = [
            'Pfizer Inc.', 'Johnson & Johnson', 'Roche', 'Novartis',
            'Merck & Co.', 'GlaxoSmithKline', 'AstraZeneca', 'Eli Lilly',
            'Bristol-Myers Squibb', 'AbbVie', 'Amgen', 'Gilead Sciences',
            'Biogen', 'Regeneron', 'Vertex Pharmaceuticals',
            'National Institute of Health', 'National Cancer Institute',
            'Mayo Clinic', 'Johns Hopkins University', 'Stanford University',
            'Harvard Medical School', 'MD Anderson Cancer Center',
            'Cleveland Clinic', 'Massachusetts General Hospital',
        ]

        # Study locations
        self.study_locations = [
            'Boston, MA', 'New York, NY', 'Philadelphia, PA', 'Baltimore, MD',
            'Atlanta, GA', 'Miami, FL', 'Nashville, TN', 'Chicago, IL',
            'Detroit, MI', 'Minneapolis, MN', 'Houston, TX', 'Dallas, TX',
            'Phoenix, AZ', 'Denver, CO', 'Los Angeles, CA',
            'San Francisco, CA', 'San Diego, CA', 'Seattle, WA',
            'Portland, OR', 'Rochester, NY', 'Cleveland, OH',
            'Pittsburgh, PA',
        ]

        # Trial name templates; each may use {drug}, {condition}, both or
        # neither.
        self.trial_name_templates = [
            'Efficacy of {drug} in {condition}',
            'Safety and Efficacy Study of {drug} for {condition}',
            'Randomized Trial of {drug} vs Placebo in {condition}',
            'Long-term Safety Study of {drug}',
            'Dose-Finding Study of {drug} in {condition}',
            'Combination Therapy with {drug} for {condition}',
            'Biomarker Study in {condition} Patients',
            'Quality of Life Study in {condition}',
            'Prevention Trial for {condition}',
            'Extended Follow-up Study of {drug}',
        ]

        # Drug names (fictional)
        self.drug_names = [
            'Therapeutix-1', 'MediCure-250', 'BioHeal-X', 'PharmaSol-III',
            'TreatWell-500', 'HealFast-A', 'CurePath-B', 'MediGen-Z',
            'BioFix-Plus', 'TherapyMax-7', 'HealthBoost-Q', 'MediAdvance-12',
        ]

    def generate(self) -> List[Dict[str, Any]]:
        """Generate ``self.rows`` clinical trial records.

        When ``self.seed`` is set, the ``random`` module and the faker
        helper are re-seeded and the trial-id counter is rewound, so that
        repeated seeded calls return the same ids as well as the same
        random fields. Without a seed the counter keeps running, so ids
        stay unique across calls on the same instance.

        Returns:
            One dict per trial, with the keys listed by :meth:`get_schema`.
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
            # Seeded calls replay the same random rows; restart the ids too,
            # otherwise identical rows would get different trial_ids.
            self._trial_counter = 1

        return [self._generate_row() for _ in range(self.rows)]

    def _pick_end_date(self, start_date, phase: str, status: str):
        """Return the trial end date, or ``None`` when it has none.

        Completed and terminated trials always get an end date. Recruiting
        and active trials get a planned one with probability
        ``_PLANNED_END_DATE_PROBABILITY``. Any other status gets ``None``.

        Args:
            start_date: Trial start; must support ``+ timedelta``.
            phase: Trial phase, used to pick the duration range.
            status: Trial status.
        """
        if status in ('Completed', 'Terminated'):
            has_end_date = True
        elif status in ('Recruiting', 'Active'):
            has_end_date = random.random() < self._PLANNED_END_DATE_PROBABILITY
        else:  # Suspended
            has_end_date = False

        if not has_end_date:
            return None

        low, high = self._DURATION_MONTHS_BY_PHASE.get(
            phase, self._DURATION_MONTHS_BY_PHASE[self._FALLBACK_PHASE]
        )
        duration_months = random.randint(low, high)
        # Months are approximated as 30 days.
        return start_date + timedelta(days=duration_months * 30)

    def _generate_row(self) -> Dict[str, Any]:
        """Build one trial record and advance the trial-id counter."""
        # Basic trial info
        trial_id = f"CTR-2025-{self._trial_counter:06d}"
        self._trial_counter += 1

        # Select condition and generate trial name. str.format ignores
        # keyword arguments a template does not reference, so one call
        # serves templates using {drug}, {condition}, both or neither.
        condition_studied = random.choice(self.conditions_studied)
        drug_name = random.choice(self.drug_names)
        trial_name = random.choice(self.trial_name_templates).format(
            drug=drug_name, condition=condition_studied
        )

        sponsor = random.choice(self.sponsors)

        # Trials can start from 5 years ago to 2 years in the future. Read
        # the clock once so both bounds share the same reference instant.
        now = datetime.now()
        start_date = self.faker_utils.date_between(
            now - timedelta(days=1825),  # 5 years ago
            now + timedelta(days=730),   # 2 years in future
        )

        phase = random.choice(self.trial_phases)
        status = random.choice(self.trial_statuses)
        end_date = self._pick_end_date(start_date, phase, status)

        # Number of participants based on phase
        low, high = self._PARTICIPANTS_BY_PHASE.get(
            phase, self._PARTICIPANTS_BY_PHASE[self._FALLBACK_PHASE]
        )
        number_of_participants = random.randint(low, high)

        principal_investigator_id = f"PI-{random.randint(10000, 99999)}"
        study_location = random.choice(self.study_locations)

        # Protocol summary. ``phase`` already reads 'Phase I', 'Phase II',
        # ..., so the template must not add its own 'Phase ' prefix.
        protocol_templates = [
            'This is a randomized, double-blind, placebo-controlled study evaluating the efficacy and safety of {drug} in patients with {condition}.',
            'A multicenter, open-label study to assess the long-term safety of {drug} in {condition} patients.',
            '{phase} dose-escalation study to determine the maximum tolerated dose of {drug} in {condition}.',
            'Randomized controlled trial comparing {drug} to standard of care in patients with {condition}.',
            'Biomarker-driven study evaluating {drug} in patients with {condition} and specific genetic markers.',
            'Quality of life assessment in {condition} patients receiving {drug} therapy.',
            'Prevention study evaluating {drug} in high-risk individuals for {condition} development.',
        ]
        protocol_summary = random.choice(protocol_templates).format(
            drug=drug_name,
            condition=condition_studied.lower(),
            phase=phase,
        )

        # Results summary (only for completed trials)
        if status == 'Completed':
            results_templates = [
                'Primary endpoint was met with statistical significance (p<0.05).',
                'Study met its primary efficacy endpoint with acceptable safety profile.',
                'Treatment showed significant improvement compared to placebo.',
                'No significant difference between treatment groups was observed.',
                'Study was positive for primary endpoint but showed safety concerns.',
                'Biomarker analysis revealed subset of patients with enhanced response.',
            ]
            results_summary = random.choice(results_templates)
        else:
            results_summary = None

        # Publication link (only for a share of completed trials with results)
        publication_link = None
        if (
            status == 'Completed'
            and results_summary
            and random.random() < self._PUBLICATION_PROBABILITY
        ):
            journal_names = [
                'NEJM', 'Lancet', 'JAMA', 'Nature Medicine',
                'Science Translational Medicine',
            ]
            journal_slug = random.choice(journal_names).lower().replace(' ', '-')
            # Synthetic DOI-style URL; it does not resolve to a real article.
            publication_link = (
                f"https://doi.org/10.{random.randint(1000, 9999)}"
                f"/{journal_slug}"
                f".{random.randint(2020, 2025)}"
                f".{random.randint(100000, 999999)}"
            )

        return {
            'trial_id': trial_id,
            'trial_name': trial_name,
            'sponsor': sponsor,
            'start_date': start_date.strftime('%Y-%m-%d'),
            'end_date': end_date.strftime('%Y-%m-%d') if end_date else None,
            'phase': phase,
            'status': status,
            'condition_studied': condition_studied,
            'number_of_participants': number_of_participants,
            'principal_investigator_id': principal_investigator_id,
            'study_location': study_location,
            'protocol_summary': protocol_summary,
            'results_summary': results_summary,
            'publication_link': publication_link,
        }

    def get_schema(self) -> Dict[str, str]:
        """Return a mapping of column name to its logical type name.

        ``end_date``, ``results_summary`` and ``publication_link`` may be
        ``None`` in generated rows.
        """
        return {
            'trial_id': 'string',
            'trial_name': 'string',
            'sponsor': 'string',
            'start_date': 'date',
            'end_date': 'date',
            'phase': 'string',
            'status': 'string',
            'condition_studied': 'string',
            'number_of_participants': 'integer',
            'principal_investigator_id': 'string',
            'study_location': 'string',
            'protocol_summary': 'string',
            'results_summary': 'string',
            'publication_link': 'string',
        }