Source code for tempdataset.core.datasets.social_media

"""
Social Media dataset generator.

Generates realistic social media post data with engagement metrics,
content analysis, and geographic information.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


[docs] class SocialMediaDataset(BaseDataset): """ Social Media dataset generator that creates realistic social media post data. Generates social media posts with: - Post information (post_id, dates, type, content) - User details (user_id, platform) - Engagement metrics (likes, comments, shares, views) - Content analysis (hashtags, mentions, sentiment) - Geographic data (location) """ def __init__(self, rows: int = 500): """ Initialize the SocialMediaDataset generator. Args: rows: Number of rows to generate (default: 500) """ super().__init__(rows) self.faker_utils = get_faker_utils() # Initialize data for consistent generation self._init_data_lists() # Counter for sequential IDs self._post_counter = 1 def _init_data_lists(self) -> None: """Initialize predefined data lists for realistic generation.""" # Social media platforms self.platforms = [ 'Facebook', 'Instagram', 'Twitter', 'LinkedIn', 'TikTok', 'YouTube', 'Snapchat', 'Pinterest', 'Reddit', 'Discord' ] # Post types self.post_types = ['Text', 'Image', 'Video', 'Link', 'Story', 'Poll', 'Live', 'Reel'] # Sentiment categories self.sentiments = ['Positive', 'Neutral', 'Negative'] # Popular hashtags by category self.hashtag_categories = { 'lifestyle': ['#lifestyle', '#daily', '#mood', '#vibes', '#blessed', '#grateful'], 'food': ['#food', '#foodie', '#delicious', '#yummy', '#cooking', '#recipe'], 'travel': ['#travel', '#vacation', '#wanderlust', '#explore', '#adventure', '#trip'], 'fitness': ['#fitness', '#workout', '#gym', '#health', '#motivation', '#fit'], 'business': ['#business', '#entrepreneur', '#success', '#marketing', '#growth', '#startup'], 'tech': ['#tech', '#technology', '#innovation', '#digital', '#ai', '#coding'], 'fashion': ['#fashion', '#style', '#ootd', '#trendy', '#shopping', '#outfit'], 'nature': ['#nature', '#outdoors', '#sunset', '#beautiful', '#photography', '#landscape'] } # Sample content templates by post type self.content_templates = { 'Text': [ "Just had an amazing day! Feeling grateful for all the good things in life.", "Excited to share some big news with everyone! Stay tuned for updates.", "Monday motivation: Every day is a new opportunity to grow and learn.", "Reflecting on the weekend and all the memories made with friends and family.", "Sometimes the smallest moments bring the greatest joy." ], 'Image': [ "Check out this beautiful sunset from my evening walk!", "Homemade dinner turned out better than expected. Recipe in comments!", "New haircut, new me! Thanks to my amazing stylist.", "Weekend vibes with the best company. Love these people!", "Finally finished this project I've been working on for months." ], 'Video': [ "Quick tutorial on how to make the perfect morning smoothie!", "Behind the scenes of today's photoshoot. So much fun!", "Dancing to my favorite song because it's Friday!", "Time-lapse of my latest art project coming together.", "Workout routine that's been keeping me motivated lately." ], 'Link': [ "Found this incredible article about sustainable living. Must read!", "New blog post is live! Sharing my thoughts on work-life balance.", "This podcast episode completely changed my perspective on creativity.", "Amazing documentary about ocean conservation. Link in bio!", "Just discovered this app that's been a game-changer for productivity." ], 'Story': [ "Coffee shop adventures this morning ☕", "Quick gym session before work 💪", "Trying out a new recipe tonight 🍝", "Beautiful weather for a walk in the park 🌞", "Late night coding session in progress 💻" ] } # Common mention patterns self.mention_patterns = [ '@friend', '@family_member', '@colleague', '@brand', '@influencer', '@company', '@restaurant', '@gym', '@store', '@artist' ]
[docs] def generate(self) -> List[Dict[str, Any]]: """ Generate social media dataset rows. Returns: List of dictionaries representing social media post rows """ if self.seed is not None: random.seed(self.seed) self.faker_utils.set_seed(self.seed) data = [] for i in range(self.rows): row = self._generate_row() data.append(row) return data
def _generate_row(self) -> Dict[str, Any]: """Generate a single social media post row.""" # Generate post date (within last year) end_date = datetime.now() start_date = end_date - timedelta(days=365) post_date = self.faker_utils.date_between(start_date, end_date) # Generate platform and post type platform = random.choice(self.platforms) post_type = random.choice(self.post_types) # Generate content based on post type content_text = self._generate_content(post_type) media_url = self._generate_media_url(post_type) # Generate engagement metrics based on platform and content quality engagement_metrics = self._generate_engagement_metrics(platform, post_type) # Generate hashtags and mentions hashtags = self._generate_hashtags() mentions = self._generate_mentions() # Generate sentiment sentiment = self._generate_sentiment(content_text) # Generate location (some posts have location, some don't) location_data = self._generate_location() if random.random() < 0.6 else (None, None) return { 'post_id': self._generate_post_id(post_date), 'user_id': self._generate_user_id(), 'platform': platform, 'post_date': post_date.strftime('%Y-%m-%d %H:%M:%S'), 'post_type': post_type, 'content_text': content_text, 'media_url': media_url, 'likes_count': engagement_metrics['likes'], 'comments_count': engagement_metrics['comments'], 'shares_count': engagement_metrics['shares'], 'views_count': engagement_metrics['views'], 'hashtags': hashtags, 'mentions': mentions, 'engagement_rate_percent': engagement_metrics['engagement_rate'], 'location_country': location_data[0], 'location_city': location_data[1], 'sentiment': sentiment } def _generate_post_id(self, post_date: datetime) -> str: """ Generate post ID in format "POST-YYYY-NNNNNN". Args: post_date: Date of the post Returns: Formatted post ID """ year = post_date.year post_num = str(self._post_counter).zfill(6) self._post_counter += 1 return f"POST-{year}-{post_num}" def _generate_user_id(self) -> str: """ Generate user ID in format "USER-YYYY-NNNNNN". Returns: Formatted user ID """ year = random.randint(2020, 2025) user_num = str(random.randint(1, 999999)).zfill(6) return f"USER-{year}-{user_num}" def _generate_content(self, post_type: str) -> str: """ Generate content text based on post type. Args: post_type: Type of the post Returns: Content text or None for media-only posts """ if post_type in self.content_templates: base_content = random.choice(self.content_templates[post_type]) # Sometimes return None for pure media posts if post_type in ['Image', 'Video'] and random.random() < 0.2: return None return base_content return "Check out this amazing content!" def _generate_media_url(self, post_type: str) -> str: """ Generate media URL based on post type. Args: post_type: Type of the post Returns: Media URL or None for text posts """ if post_type == 'Text': return None if random.random() < 0.8 else f"https://example.com/media/{random.randint(1000, 9999)}.jpg" elif post_type == 'Image': return f"https://example.com/images/{random.randint(1000, 9999)}.jpg" elif post_type == 'Video': return f"https://example.com/videos/{random.randint(1000, 9999)}.mp4" elif post_type == 'Link': domains = ['example.com', 'blog.example.com', 'news.example.com', 'shop.example.com'] return f"https://{random.choice(domains)}/article/{random.randint(100, 999)}" else: return f"https://example.com/media/{random.randint(1000, 9999)}.jpg" if random.random() < 0.7 else None def _generate_engagement_metrics(self, platform: str, post_type: str) -> Dict[str, Any]: """ Generate realistic engagement metrics based on platform and post type. Args: platform: Social media platform post_type: Type of the post Returns: Dictionary with engagement metrics """ # Base engagement ranges by platform platform_ranges = { 'Instagram': {'likes': (10, 500), 'comments': (1, 50), 'shares': (0, 20)}, 'Facebook': {'likes': (5, 300), 'comments': (0, 30), 'shares': (0, 50)}, 'Twitter': {'likes': (2, 200), 'comments': (0, 25), 'shares': (0, 100)}, 'TikTok': {'likes': (50, 1000), 'comments': (5, 100), 'shares': (2, 200)}, 'LinkedIn': {'likes': (3, 150), 'comments': (0, 20), 'shares': (0, 30)}, 'YouTube': {'likes': (20, 800), 'comments': (2, 80), 'shares': (1, 40)}, } # Default ranges for other platforms default_range = {'likes': (5, 250), 'comments': (0, 25), 'shares': (0, 30)} ranges = platform_ranges.get(platform, default_range) # Generate base metrics likes = random.randint(*ranges['likes']) comments = random.randint(*ranges['comments']) shares = random.randint(*ranges['shares']) # Adjust based on post type if post_type == 'Video': likes = int(likes * random.uniform(1.2, 2.0)) comments = int(comments * random.uniform(1.1, 1.5)) shares = int(shares * random.uniform(1.3, 1.8)) elif post_type == 'Image': likes = int(likes * random.uniform(1.1, 1.6)) elif post_type == 'Text': likes = int(likes * random.uniform(0.7, 1.2)) comments = int(comments * random.uniform(1.2, 1.8)) # Generate views (higher than engagement) total_engagement = likes + comments + shares views = int(total_engagement * random.uniform(5, 20)) if total_engagement > 0 else random.randint(10, 100) # Calculate engagement rate engagement_rate = round((total_engagement / views * 100), 2) if views > 0 else 0.0 return { 'likes': likes, 'comments': comments, 'shares': shares, 'views': views, 'engagement_rate': engagement_rate } def _generate_hashtags(self) -> str: """ Generate hashtags for the post. Returns: Comma-separated string of hashtags """ # Randomly select a category and number of hashtags num_hashtags = random.randint(0, 8) if num_hashtags == 0: return "" # Select hashtags from different categories all_hashtags = [] for category in random.sample(list(self.hashtag_categories.keys()), min(3, len(self.hashtag_categories))): all_hashtags.extend(self.hashtag_categories[category]) selected_hashtags = random.sample(all_hashtags, min(num_hashtags, len(all_hashtags))) return ", ".join(selected_hashtags) def _generate_mentions(self) -> str: """ Generate mentions for the post. Returns: Comma-separated string of mentions """ num_mentions = random.randint(0, 4) if num_mentions == 0: return "" mentions = [] for _ in range(num_mentions): base_mention = random.choice(self.mention_patterns) mention = f"{base_mention}{random.randint(1, 999)}" mentions.append(mention) return ", ".join(mentions) def _generate_sentiment(self, content_text: str) -> str: """ Generate sentiment based on content. Args: content_text: The post content Returns: Sentiment category """ if content_text is None: return random.choice(self.sentiments) # Simple sentiment analysis based on keywords positive_words = ['amazing', 'great', 'love', 'excited', 'happy', 'grateful', 'wonderful', 'perfect', 'best'] negative_words = ['bad', 'terrible', 'hate', 'angry', 'sad', 'disappointed', 'worst', 'awful'] content_lower = content_text.lower() positive_count = sum(1 for word in positive_words if word in content_lower) negative_count = sum(1 for word in negative_words if word in content_lower) if positive_count > negative_count: return 'Positive' elif negative_count > positive_count: return 'Negative' else: return 'Neutral' def _generate_location(self) -> tuple: """ Generate location data. Returns: Tuple of (country, city) """ country = self.faker_utils.country() city = self.faker_utils.city() return (country, city)
[docs] def get_schema(self) -> Dict[str, str]: """ Return column schema with types. Returns: Dictionary mapping column names to their data types """ return { 'post_id': 'string', 'user_id': 'string', 'platform': 'string', 'post_date': 'datetime', 'post_type': 'string', 'content_text': 'string', 'media_url': 'string', 'likes_count': 'integer', 'comments_count': 'integer', 'shares_count': 'integer', 'views_count': 'integer', 'hashtags': 'string', 'mentions': 'string', 'engagement_rate_percent': 'float', 'location_country': 'string', 'location_city': 'string', 'sentiment': 'string' }