# Source code for tempdataset.core.datasets.user_sessions

"""
User sessions dataset generator.

Generates realistic user session tracking data.
"""

import random
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class UserSessionsDataset(BaseDataset):
    """User sessions dataset generator for user behavior analytics and session tracking.

    Each generated row describes one synthetic web session: who (optionally)
    the user was, when the session ran, which device/OS/browser was used,
    where the traffic came from and how the session ended.

    Args:
        rows: Number of session rows ``generate()`` produces.
    """

    # Relative frequencies matching the order of ``self.devices``
    # (Desktop, Mobile, Tablet).
    _DEVICE_WEIGHTS = (0.55, 0.35, 0.10)

    # Browser pools for handheld devices; desktop browsers are derived from
    # ``self.browsers`` in ``_init_data_lists``.
    _MOBILE_BROWSERS = ('Mobile Safari', 'Chrome Mobile')
    _TABLET_BROWSERS = ('Safari 17', 'Chrome Mobile')

    # Candidate landing and exit paths.
    _ENTRY_PAGES = ('/home', '/products', '/about', '/login', '/search', '/blog')
    _EXIT_PAGES = ('/home', '/products', '/checkout', '/contact', '/logout', '/404')

    def __init__(self, rows: int = 500):
        super().__init__(rows)
        self.faker_utils = get_faker_utils()
        self._init_data_lists()
        # Monotonic counter used to build unique session ids.
        self._session_counter = 1

    def _init_data_lists(self) -> None:
        """Populate the lookup lists that rows are sampled from."""
        self.devices = ['Desktop', 'Mobile', 'Tablet']

        self.operating_systems = [
            'Windows 10', 'Windows 11', 'macOS Monterey', 'macOS Ventura',
            'macOS Sonoma', 'iOS 16', 'iOS 17', 'Android 12', 'Android 13',
            'Android 14', 'Ubuntu 20.04', 'Ubuntu 22.04'
        ]

        self.browsers = [
            'Chrome 120', 'Chrome 121', 'Firefox 121', 'Safari 17',
            'Edge 120', 'Opera 106', 'Mobile Safari', 'Chrome Mobile'
        ]

        self.countries = [
            'United States', 'United Kingdom', 'Canada', 'Germany', 'France',
            'Japan', 'Australia', 'India', 'Brazil', 'Spain', 'Italy',
            'Netherlands'
        ]

        self.cities = {
            'United States': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
            'United Kingdom': ['London', 'Manchester', 'Birmingham', 'Liverpool', 'Bristol'],
            'Canada': ['Toronto', 'Vancouver', 'Montreal', 'Calgary', 'Ottawa'],
            'Germany': ['Berlin', 'Munich', 'Hamburg', 'Frankfurt', 'Cologne'],
            'France': ['Paris', 'Lyon', 'Marseille', 'Toulouse', 'Nice'],
            'Japan': ['Tokyo', 'Osaka', 'Yokohama', 'Nagoya', 'Sapporo'],
            'Australia': ['Sydney', 'Melbourne', 'Brisbane', 'Perth', 'Adelaide'],
            'India': ['Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata'],
            'Brazil': ['São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Fortaleza'],
            'Spain': ['Madrid', 'Barcelona', 'Valencia', 'Seville', 'Zaragoza'],
            'Italy': ['Rome', 'Milan', 'Naples', 'Turin', 'Florence'],
            'Netherlands': ['Amsterdam', 'Rotterdam', 'The Hague', 'Utrecht', 'Eindhoven']
        }

        self.traffic_sources = [
            'Direct', 'Google Search', 'Bing Search', 'Social Media',
            'Email Campaign', 'Referral', 'Paid Ads', 'Newsletter',
            'YouTube', 'LinkedIn'
        ]

        # ``None`` is a valid choice: a non-direct visit without a campaign tag.
        self.utm_campaigns = [
            'summer_sale_2024', 'new_user_welcome', 'holiday_promotion',
            'product_launch', 'retargeting_campaign', 'brand_awareness',
            'seasonal_offer', 'loyalty_program', None
        ]

        self.session_outcomes = [
            'conversion', 'bounce', 'engaged_browsing', 'abandoned_cart',
            'newsletter_signup', 'account_creation', 'support_contact'
        ]

        # Device-specific pools are derived once here instead of being
        # re-filtered for every generated row. Order is preserved so that
        # seeded sampling picks the same elements as per-row filtering would.
        self._desktop_operating_systems = [
            name for name in self.operating_systems
            if 'Windows' in name or 'macOS' in name or 'Ubuntu' in name
        ]
        self._handheld_operating_systems = [
            name for name in self.operating_systems
            if 'iOS' in name or 'Android' in name
        ]
        self._desktop_browsers = [
            name for name in self.browsers if 'Mobile' not in name
        ]

    def generate(self) -> List[Dict[str, Any]]:
        """Generate ``self.rows`` session records.

        When ``self.seed`` is set, both the ``random`` module and the faker
        utilities are reseeded and the session counter is rewound, so repeated
        calls on the same instance return identical rows (ids included).

        Returns:
            A list of row dictionaries whose keys match ``get_schema()``.
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
            # Without this reset a second seeded call would repeat every
            # column except ``session_id``.
            self._session_counter = 1

        return [self._generate_row() for _ in range(self.rows)]

    def _pick_outcome_and_duration(self):
        """Return ``(outcome, duration_minutes)`` for one session.

        About 15% of sessions bounce (1-3 minutes); 10% of the remainder
        convert (15-120 minutes); the rest last 3-60 minutes and get one of
        the other outcomes.
        """
        if random.random() < 0.15:
            return 'bounce', random.randint(1, 3)
        if random.random() < 0.1:
            return 'conversion', random.randint(15, 120)
        minutes = random.randint(3, 60)
        # Slice skips the first two entries: 'conversion' and 'bounce'.
        return random.choice(self.session_outcomes[2:]), minutes

    def _pick_device_profile(self):
        """Return ``(device_type, operating_system, browser)`` for one session."""
        device_type = random.choices(self.devices, weights=self._DEVICE_WEIGHTS)[0]

        if device_type == 'Desktop':
            operating_system = random.choice(self._desktop_operating_systems)
            browser = random.choice(self._desktop_browsers)
        elif device_type == 'Mobile':
            operating_system = random.choice(self._handheld_operating_systems)
            browser = random.choice(self._MOBILE_BROWSERS)
        else:  # Tablet
            operating_system = random.choice(self._handheld_operating_systems)
            browser = random.choice(self._TABLET_BROWSERS)

        return device_type, operating_system, browser

    def _generate_row(self) -> Dict[str, Any]:
        """Build a single session record.

        The order of random draws below is significant: changing it changes
        the rows produced for a given seed.

        Returns:
            A dictionary with one value per column listed in ``get_schema()``.
        """
        # Basic session info
        session_id = f"SESS-2025-{self._session_counter:08d}"
        self._session_counter += 1

        # User ID - roughly 30% of sessions are anonymous
        user_id = f"USER-{random.randint(10000, 99999)}" if random.random() < 0.7 else None
        is_authenticated = user_id is not None

        # Session timing: a day within the last 30 days plus a random time of day.
        now = datetime.now()
        start_day = self.faker_utils.date_between(now - timedelta(days=30), now)
        # NOTE(review): if date_between can return today's date, the random
        # time of day may land later than the current time - confirm whether
        # future-dated sessions are acceptable.
        session_start = datetime.combine(
            start_day,
            datetime.min.time().replace(
                hour=random.randint(0, 23),
                minute=random.randint(0, 59),
                second=random.randint(0, 59)
            )
        )

        # Session duration (in minutes) varies by outcome
        outcome, duration_minutes = self._pick_outcome_and_duration()
        session_end = session_start + timedelta(minutes=duration_minutes)

        # Device and browser info
        device_type, operating_system, browser = self._pick_device_profile()

        # Location
        country = random.choice(self.countries)
        city = random.choice(self.cities[country])

        # Traffic source and campaign (direct traffic never carries a campaign)
        traffic_source = random.choice(self.traffic_sources)
        utm_campaign = random.choice(self.utm_campaigns) if traffic_source != 'Direct' else None

        # Page metrics. The first draw is always made, even when it is
        # overwritten, so seeded output stays the same as before.
        page_views = random.randint(1, 25)
        if outcome == 'bounce':
            page_views = 1
        elif outcome == 'conversion':
            page_views = random.randint(5, 25)

        # Engagement metrics
        bounce_rate = 1.0 if outcome == 'bounce' else 0.0

        # Entry and exit pages
        entry_page = random.choice(self._ENTRY_PAGES)
        if outcome == 'conversion':
            exit_page = '/checkout'
        elif outcome == 'bounce':
            # A bounce leaves from the page it landed on.
            exit_page = entry_page
        else:
            exit_page = random.choice(self._EXIT_PAGES)

        # Conversion data
        conversion_flag = outcome == 'conversion'
        conversion_value = round(random.uniform(25.0, 500.0), 2) if conversion_flag else 0.0

        # Returning vs new user: only authenticated sessions can be "returning"
        is_returning_user = random.random() < 0.4 if is_authenticated else False

        return {
            'session_id': session_id,
            'user_id': user_id,
            'session_start': session_start.strftime('%Y-%m-%d %H:%M:%S'),
            'session_end': session_end.strftime('%Y-%m-%d %H:%M:%S'),
            'duration_minutes': duration_minutes,
            'device_type': device_type,
            'operating_system': operating_system,
            'browser': browser,
            'country': country,
            'city': city,
            'traffic_source': traffic_source,
            'utm_campaign': utm_campaign,
            'entry_page': entry_page,
            'exit_page': exit_page,
            'page_views': page_views,
            'bounce_rate': bounce_rate,
            'conversion_flag': conversion_flag,
            'conversion_value': conversion_value,
            'is_authenticated': is_authenticated,
            'is_returning_user': is_returning_user
        }

    def get_schema(self) -> Dict[str, str]:
        """Return a mapping of column name to logical type name.

        ``user_id`` and ``utm_campaign`` are typed as ``'string'`` but may be
        ``None`` in generated rows.
        """
        return {
            'session_id': 'string',
            'user_id': 'string',
            'session_start': 'datetime',
            'session_end': 'datetime',
            'duration_minutes': 'integer',
            'device_type': 'string',
            'operating_system': 'string',
            'browser': 'string',
            'country': 'string',
            'city': 'string',
            'traffic_source': 'string',
            'utm_campaign': 'string',
            'entry_page': 'string',
            'exit_page': 'string',
            'page_views': 'integer',
            'bounce_rate': 'float',
            'conversion_flag': 'boolean',
            'conversion_value': 'float',
            'is_authenticated': 'boolean',
            'is_returning_user': 'boolean'
        }