# Source code for tempdataset.core.datasets.web_analytics

"""
Web analytics dataset generator.

Generates realistic web analytics and traffic data.
"""

import random
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


class WebAnalyticsDataset(BaseDataset):
    """Web analytics dataset generator for website traffic data.

    Every generated row describes a single web session: an ID, an optional
    user ID, the landing page, the referrer, a timestamp, device/browser/OS,
    geography, engagement metrics, conversion data and campaign attribution.
    The column names and their types are listed by :meth:`get_schema`.
    """

    def __init__(self, rows: int = 500):
        """Create a generator that produces ``rows`` session records.

        Args:
            rows: Number of rows returned by :meth:`generate`.
        """
        super().__init__(rows)
        self.faker_utils = get_faker_utils()
        self._init_data_lists()
        # Running counter used to build sequential session IDs.
        self._session_counter = 1

    def _init_data_lists(self) -> None:
        """Populate the value pools that rows are sampled from."""
        self.device_types = ['Desktop', 'Mobile', 'Tablet']

        self.browsers = [
            'Chrome', 'Firefox', 'Safari', 'Edge', 'Opera',
            'Internet Explorer', 'Samsung Internet', 'UC Browser',
        ]

        self.operating_systems = [
            'Windows 10', 'Windows 11', 'macOS', 'Linux',
            'iOS', 'Android', 'ChromeOS', 'Ubuntu',
        ]

        self.traffic_sources = [
            'Organic', 'Paid', 'Referral', 'Direct', 'Social', 'Email',
        ]

        self.page_urls = [
            '/', '/home', '/products', '/about', '/contact', '/blog',
            '/services', '/pricing', '/login', '/signup', '/checkout',
            '/cart', '/search', '/category/electronics',
            '/category/clothing', '/product/laptop', '/product/phone',
            '/support', '/faq', '/terms',
        ]

        # The trailing None stands for "no referrer"; it is filtered out
        # wherever a concrete domain is required.
        self.referrer_domains = [
            'google.com', 'facebook.com', 'twitter.com', 'linkedin.com',
            'youtube.com', 'instagram.com', 'reddit.com', 'pinterest.com',
            'tiktok.com', 'bing.com', 'yahoo.com', None,
        ]

        # The trailing None stands for "no campaign".
        self.campaign_names = [
            'summer_sale_2025', 'black_friday', 'product_launch',
            'brand_awareness', 'retargeting_campaign', 'email_newsletter',
            'social_media_ads', 'search_ads', None,
        ]

        self.countries = [
            'United States', 'Canada', 'United Kingdom', 'Germany',
            'France', 'Australia', 'Japan', 'China', 'India', 'Brazil',
            'Mexico', 'Netherlands', 'Sweden', 'Norway', 'Denmark',
            'Spain', 'Italy',
        ]

        # Only these countries have a curated city list; for every other
        # country the city comes from faker_utils.city().
        self.cities = {
            'United States': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
            'Canada': ['Toronto', 'Vancouver', 'Montreal', 'Calgary', 'Ottawa'],
            'United Kingdom': ['London', 'Manchester', 'Birmingham', 'Glasgow', 'Liverpool'],
            'Germany': ['Berlin', 'Munich', 'Hamburg', 'Cologne', 'Frankfurt'],
            'France': ['Paris', 'Lyon', 'Marseille', 'Toulouse', 'Nice'],
        }

    def generate(self) -> List[Dict[str, Any]]:
        """Generate ``self.rows`` session records.

        When ``self.seed`` is set, both random sources are re-seeded and the
        session counter is restarted, so repeated calls yield the same IDs
        and the same random draws. Timestamps are still relative to the
        current clock and therefore differ between runs.

        Returns:
            A list of row dictionaries keyed as described by
            :meth:`get_schema`.
        """
        if self.seed is not None:
            random.seed(self.seed)
            self.faker_utils.set_seed(self.seed)
            # Without this reset a second seeded call would continue the
            # numbering and no longer reproduce the first call's session IDs.
            self._session_counter = 1

        return [self._generate_row() for _ in range(self.rows)]

    def _pick_referrer(self, traffic_source: str):
        """Return a referrer URL for ``traffic_source``, or None for 'Direct'."""
        if traffic_source == 'Direct':
            return None
        if traffic_source == 'Organic':
            engine = random.choice(['google.com', 'bing.com'])
            return f"https://www.{engine}/search?q=keywords"

        # None entries are dropped first, so the chosen domain is always a
        # real string and no fallback branch is needed.
        domains = [d for d in self.referrer_domains if d is not None]
        return f"https://www.{random.choice(domains)}/page"

    def _pick_timestamp(self) -> datetime:
        """Return a session timestamp from roughly the last 30 days.

        The day comes from ``faker_utils.date_between`` (presumably a date
        or datetime -- confirm against faker_utils) and a random time of day
        is attached to it. The result is capped at the current moment.
        """
        now = datetime.now()
        day = self.faker_utils.date_between(now - timedelta(days=30), now)
        moment = datetime.combine(
            day,
            datetime.min.time().replace(
                hour=random.randint(0, 23),
                minute=random.randint(0, 59),
                second=random.randint(0, 59),
            ),
        )
        # A random time attached to today's date may lie ahead of the clock;
        # a session cannot have happened in the future.
        return min(moment, now)

    def _pick_device(self):
        """Return a consistent ``(device_type, browser, operating_system)``."""
        device_type = random.choice(self.device_types)

        # Browser weights depend on the device class.
        if device_type == 'Mobile':
            browser = random.choices(
                ['Chrome', 'Safari', 'Samsung Internet', 'Firefox'],
                weights=[0.5, 0.3, 0.15, 0.05],
            )[0]
            os_options = ['iOS', 'Android']
        elif device_type == 'Tablet':
            browser = random.choices(
                ['Safari', 'Chrome', 'Firefox'],
                weights=[0.6, 0.3, 0.1],
            )[0]
            os_options = ['iOS', 'Android']
        else:  # Desktop
            browser = random.choices(
                ['Chrome', 'Firefox', 'Safari', 'Edge'],
                weights=[0.65, 0.15, 0.12, 0.08],
            )[0]
            os_options = ['Windows 10', 'Windows 11', 'macOS', 'Linux']

        # Rule out pairs that do not exist in practice: Safari only on Apple
        # systems, Samsung Internet only on Android.
        if browser == 'Safari':
            os_options = [name for name in os_options if name in ('iOS', 'macOS')]
        elif browser == 'Samsung Internet':
            os_options = ['Android']

        return device_type, browser, random.choice(os_options)

    def _generate_row(self) -> Dict[str, Any]:
        """Build one session record and advance the session counter.

        Returns:
            A dictionary with one entry per column of :meth:`get_schema`.
            ``user_id``, ``referrer_url``, ``conversion_value`` and
            ``campaign_name`` may be None.
        """
        # Basic session info
        session_id = f"WEB-2025-{self._session_counter:06d}"
        self._session_counter += 1

        # About 70% of sessions belong to a logged-in user.
        user_id = (
            f"USER-{random.randint(10000, 999999)}"
            if random.random() < 0.7
            else None
        )

        page_url = random.choice(self.page_urls)
        traffic_source = random.choice(self.traffic_sources)
        referrer_url = self._pick_referrer(traffic_source)
        timestamp = self._pick_timestamp()
        device_type, browser, operating_system = self._pick_device()

        # Geographic info
        geo_country = random.choice(self.countries)
        if geo_country in self.cities:
            geo_city = random.choice(self.cities[geo_country])
        else:
            geo_city = self.faker_utils.city()

        # Session metrics: a single page view counts as a bounce.
        page_views = random.randint(1, 20)
        if page_views == 1:
            session_duration_seconds = random.randint(5, 300)
            bounce_rate = 100.0
        else:
            session_duration_seconds = random.randint(60, 3600)  # 1 min to 1 h
            bounce_rate = 0.0

        # Conversion: 3% of sessions convert, with a value of 10.00-500.00.
        conversion_occurred = random.random() < 0.03
        conversion_value = None
        if conversion_occurred:
            conversion_value = round(random.uniform(10.0, 500.0), 2)

        # Only campaign-driven sources carry a campaign name.
        campaign_name = None
        if traffic_source in ['Paid', 'Email', 'Social']:
            campaign_name = random.choice(
                [c for c in self.campaign_names if c is not None]
            )

        return {
            'session_id': session_id,
            'user_id': user_id,
            'page_url': page_url,
            'referrer_url': referrer_url,
            'timestamp': timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            'page_views': page_views,
            'session_duration_seconds': session_duration_seconds,
            'bounce_rate': bounce_rate,
            'device_type': device_type,
            'browser': browser,
            'operating_system': operating_system,
            'geo_country': geo_country,
            'geo_city': geo_city,
            'conversion_occurred': conversion_occurred,
            'conversion_value': conversion_value,
            'traffic_source': traffic_source,
            'campaign_name': campaign_name,
        }

    def get_schema(self) -> Dict[str, str]:
        """Return a mapping of column name to its type name.

        The keys match the dictionaries produced by :meth:`generate`.
        """
        return {
            'session_id': 'string',
            'user_id': 'string',
            'page_url': 'string',
            'referrer_url': 'string',
            'timestamp': 'datetime',
            'page_views': 'integer',
            'session_duration_seconds': 'integer',
            'bounce_rate': 'float',
            'device_type': 'string',
            'browser': 'string',
            'operating_system': 'string',
            'geo_country': 'string',
            'geo_city': 'string',
            'conversion_occurred': 'boolean',
            'conversion_value': 'float',
            'traffic_source': 'string',
            'campaign_name': 'string',
        }