Source code for tempdataset.core.datasets.user_profiles

"""
User Profiles dataset generator.

Generates realistic user profile data for social media platforms
with demographics, engagement metrics, and account information.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


[docs] class UserProfilesDataset(BaseDataset): """ User Profiles dataset generator that creates realistic social media user profiles. Generates user profiles with: - User information (user_id, username, personal details) - Account details (join_date, platform, status) - Social metrics (followers, following, posts) - Profile content (bio, interests, connections) - Geographic data (location) """ def __init__(self, rows: int = 500): """ Initialize the UserProfilesDataset generator. Args: rows: Number of rows to generate (default: 500) """ super().__init__(rows) self.faker_utils = get_faker_utils() # Initialize data for consistent generation self._init_data_lists() # Counter for sequential IDs self._user_counter = 1 def _init_data_lists(self) -> None: """Initialize predefined data lists for realistic generation.""" # Social media platforms self.platforms = [ 'Facebook', 'Instagram', 'Twitter', 'LinkedIn', 'TikTok', 'YouTube', 'Snapchat', 'Pinterest', 'Reddit', 'Discord' ] # Account statuses self.account_statuses = ['Active', 'Suspended', 'Deleted', 'Inactive'] # Common interests categories self.interest_categories = { 'hobbies': ['Photography', 'Reading', 'Gaming', 'Cooking', 'Gardening', 'Music', 'Art', 'Writing'], 'sports': ['Football', 'Basketball', 'Tennis', 'Swimming', 'Running', 'Cycling', 'Yoga', 'Hiking'], 'technology': ['Programming', 'AI', 'Blockchain', 'Mobile Apps', 'Web Development', 'Data Science'], 'lifestyle': ['Fashion', 'Beauty', 'Travel', 'Food', 'Fitness', 'Wellness', 'Home Decor'], 'entertainment': ['Movies', 'TV Shows', 'Podcasts', 'Stand-up Comedy', 'Theater', 'Concerts'], 'business': ['Entrepreneurship', 'Marketing', 'Finance', 'Leadership', 'Networking', 'Innovation'], 'education': ['Online Learning', 'Languages', 'Science', 'History', 'Philosophy', 'Psychology'] } # Bio templates by platform type self.bio_templates = { 'professional': [ "Digital marketing specialist | Coffee enthusiast | Dog lover", "Software engineer by day, photographer by night 📸", "Helping businesses grow through innovative solutions", "Passionate about technology and sustainable living", "Building the future, one line of code at a time" ], 'personal': [ "Living life one adventure at a time ✈️", "Foodie | Traveler | Always looking for the next great story", "Spreading positivity wherever I go 🌟", "Mom of two | Yoga instructor | Plant-based living", "Just trying to make the world a little brighter" ], 'creative': [ "Artist | Dreamer | Creator of beautiful things", "Capturing moments that matter 📷", "Music is my language, art is my voice", "Turning ideas into reality through design", "Storyteller with a passion for visual narratives" ], 'influencer': [ "Lifestyle blogger | Brand partnerships: email@example.com", "Fashion & beauty content creator | Link in bio 👇", "Sharing my journey to inspire yours ✨", "Authentic content | Real stories | Genuine connections", "Building a community of like-minded souls" ] } # Username patterns self.username_patterns = [ '{first_name}_{last_name}', '{first_name}{number}', '{first_name}_{word}', '{word}_{first_name}', '{first_name}{last_initial}{number}', 'the_{first_name}', '{first_name}_official' ] # Common username words self.username_words = [ 'creative', 'digital', 'real', 'official', 'pro', 'studio', 'life', 'world', 'vibes', 'daily', 'journey', 'adventures', 'stories', 'moments', 'dreams' ]
[docs] def generate(self) -> List[Dict[str, Any]]: """ Generate user profiles dataset rows. Returns: List of dictionaries representing user profile rows """ if self.seed is not None: random.seed(self.seed) self.faker_utils.set_seed(self.seed) data = [] for i in range(self.rows): row = self._generate_row() data.append(row) return data
def _generate_row(self) -> Dict[str, Any]: """Generate a single user profile row.""" # Generate basic user information full_name = self.faker_utils.name() first_name = full_name.split()[0].lower() last_name = full_name.split()[-1].lower() # Generate join date (within last 5 years) end_date = datetime.now() start_date = end_date - timedelta(days=1825) # 5 years join_date = self.faker_utils.date_between(start_date, end_date) # Generate platform platform = random.choice(self.platforms) # Generate username username = self._generate_username(first_name, last_name) # Generate email email = self.faker_utils.email(full_name) # Generate social metrics based on account age and platform account_age_days = (datetime.now().date() - join_date).days social_metrics = self._generate_social_metrics(platform, account_age_days) # Generate bio and interests bio = self._generate_bio() interests = self._generate_interests() # Generate account status (most are active) account_status = self._generate_account_status() # Generate verification status (rare) verified = self._generate_verification_status(social_metrics['followers_count']) # Generate location location_data = self._generate_location() if random.random() < 0.7 else (None, None) # Generate profile picture URL profile_picture_url = self._generate_profile_picture_url() # Generate connections (for platforms like LinkedIn) connections = self._generate_connections(platform, social_metrics['followers_count']) return { 'user_id': self._generate_user_id(join_date), 'username': username, 'full_name': full_name, 'email': email, 'join_date': join_date.strftime('%Y-%m-%d'), 'platform': platform, 'followers_count': social_metrics['followers_count'], 'following_count': social_metrics['following_count'], 'total_posts': social_metrics['total_posts'], 'bio': bio, 'profile_picture_url': profile_picture_url, 'account_status': account_status, 'verified': verified, 'location_country': location_data[0], 'location_city': location_data[1], 'interests': interests, 'connections': connections } def _generate_user_id(self, join_date) -> str: """ Generate user ID in format "USER-YYYY-NNNNNN". Args: join_date: Date when user joined Returns: Formatted user ID """ year = join_date.year user_num = str(self._user_counter).zfill(6) self._user_counter += 1 return f"USER-{year}-{user_num}" def _generate_username(self, first_name: str, last_name: str) -> str: """ Generate username based on name and patterns. Args: first_name: User's first name last_name: User's last name Returns: Generated username """ pattern = random.choice(self.username_patterns) # Replace placeholders username = pattern.format( first_name=first_name, last_name=last_name, last_initial=last_name[0] if last_name else 'x', number=random.randint(1, 999), word=random.choice(self.username_words) ) # Ensure username is not too long if len(username) > 20: username = f"{first_name}{random.randint(1, 999)}" return username def _generate_social_metrics(self, platform: str, account_age_days: int) -> Dict[str, int]: """ Generate realistic social metrics based on platform and account age. Args: platform: Social media platform account_age_days: Age of account in days Returns: Dictionary with social metrics """ # Base ranges by platform platform_ranges = { 'Instagram': {'followers': (50, 5000), 'following': (100, 1500), 'posts': (10, 2000)}, 'Facebook': {'followers': (20, 2000), 'following': (50, 800), 'posts': (5, 1000)}, 'Twitter': {'followers': (30, 3000), 'following': (100, 2000), 'posts': (50, 5000)}, 'TikTok': {'followers': (100, 10000), 'following': (200, 1000), 'posts': (20, 500)}, 'LinkedIn': {'followers': (100, 5000), 'following': (200, 2000), 'posts': (10, 500)}, 'YouTube': {'followers': (10, 1000), 'following': (50, 500), 'posts': (5, 200)}, } # Default ranges for other platforms default_range = {'followers': (50, 2000), 'following': (100, 1000), 'posts': (10, 800)} ranges = platform_ranges.get(platform, default_range) # Adjust based on account age (older accounts tend to have more activity) age_multiplier = min(1.0 + (account_age_days / 365) * 0.3, 2.0) # Max 2x multiplier followers_count = int(random.randint(*ranges['followers']) * age_multiplier) following_count = int(random.randint(*ranges['following']) * age_multiplier) total_posts = int(random.randint(*ranges['posts']) * age_multiplier) # Ensure following is not much higher than followers for most users if following_count > followers_count * 3 and random.random() < 0.7: following_count = int(followers_count * random.uniform(0.5, 2.0)) return { 'followers_count': followers_count, 'following_count': following_count, 'total_posts': total_posts } def _generate_bio(self) -> str: """ Generate bio text. Returns: Bio text or None for some users """ # Some users don't have bios if random.random() < 0.3: return None # Select bio type bio_type = random.choice(['professional', 'personal', 'creative', 'influencer']) bio = random.choice(self.bio_templates[bio_type]) return bio def _generate_interests(self) -> str: """ Generate user interests. Returns: Comma-separated string of interests """ num_interests = random.randint(2, 8) # Select interests from different categories all_interests = [] selected_categories = random.sample( list(self.interest_categories.keys()), min(3, len(self.interest_categories)) ) for category in selected_categories: all_interests.extend(self.interest_categories[category]) selected_interests = random.sample(all_interests, min(num_interests, len(all_interests))) return ", ".join(selected_interests) def _generate_account_status(self) -> str: """ Generate account status with realistic distribution. Returns: Account status """ # Most accounts are active weights = [0.85, 0.05, 0.02, 0.08] # Active, Suspended, Deleted, Inactive return random.choices(self.account_statuses, weights=weights)[0] def _generate_verification_status(self, followers_count: int) -> bool: """ Generate verification status based on follower count. Args: followers_count: Number of followers Returns: Verification status """ # Higher follower count increases chance of verification if followers_count > 10000: return random.random() < 0.3 elif followers_count > 5000: return random.random() < 0.1 elif followers_count > 1000: return random.random() < 0.02 else: return random.random() < 0.001 def _generate_location(self) -> tuple: """ Generate location data. Returns: Tuple of (country, city) """ country = self.faker_utils.country() city = self.faker_utils.city() return (country, city) def _generate_profile_picture_url(self) -> str: """ Generate profile picture URL. Returns: Profile picture URL or None """ # Some users don't have profile pictures if random.random() < 0.15: return None return f"https://example.com/profiles/{random.randint(1000, 9999)}.jpg" def _generate_connections(self, platform: str, followers_count: int) -> int: """ Generate connections count (mainly for LinkedIn-style platforms). Args: platform: Social media platform followers_count: Number of followers Returns: Number of connections """ if platform == 'LinkedIn': # LinkedIn connections are typically lower than followers base_connections = int(followers_count * random.uniform(0.3, 0.8)) return max(base_connections, random.randint(50, 500)) else: # For other platforms, connections might be similar to followers return int(followers_count * random.uniform(0.1, 0.5))
[docs] def get_schema(self) -> Dict[str, str]: """ Return column schema with types. Returns: Dictionary mapping column names to their data types """ return { 'user_id': 'string', 'username': 'string', 'full_name': 'string', 'email': 'string', 'join_date': 'date', 'platform': 'string', 'followers_count': 'integer', 'following_count': 'integer', 'total_posts': 'integer', 'bio': 'string', 'profile_picture_url': 'string', 'account_status': 'string', 'verified': 'boolean', 'location_country': 'string', 'location_city': 'string', 'interests': 'string', 'connections': 'integer' }