Source code for tempdataset.core.datasets.banking

"""
Banking dataset generator.

Generates realistic banking transaction data with account details and fraud detection.
"""

import random
import string
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional

from .base import BaseDataset
from ..utils.faker_utils import get_faker_utils


[docs] class BankingDataset(BaseDataset): """ Banking dataset generator that creates realistic banking transaction data. Generates 20 columns of banking data including: - Transaction details (ID, type, amount, currency) - Account information (account ID, type, balances) - Merchant and location data - Fraud detection indicators """ def __init__(self, rows: int = 500): """ Initialize the BankingDataset generator. Args: rows: Number of rows to generate (default: 500) """ super().__init__(rows) self.faker_utils = get_faker_utils() # Initialize data for consistent generation self._init_data_lists() # Counters for sequential IDs self._transaction_counter = 1 self._account_counter = 1000 self._customer_counter = 1 def _init_data_lists(self) -> None: """Initialize predefined data lists for realistic generation.""" # Account types self.account_types = ['Checking', 'Savings', 'Business', 'Money Market', 'Certificate of Deposit'] # Transaction types self.transaction_types = ['Deposit', 'Withdrawal', 'Transfer', 'Payment', 'Fee', 'Interest', 'Refund'] # Merchant categories self.merchant_categories = [ 'Grocery Stores', 'Gas Stations', 'Restaurants', 'Retail Stores', 'Online Shopping', 'ATM Withdrawals', 'Bank Transfers', 'Utility Payments', 'Insurance Payments', 'Healthcare', 'Entertainment', 'Travel', 'Education', 'Professional Services', 'Home Improvement', 'Automotive', 'Clothing', 'Electronics', 'Pharmacy', 'Subscription Services' ] # Merchants by category self.merchants = { 'Grocery Stores': ['Walmart Supercenter', 'Target', 'Kroger', 'Safeway', 'Whole Foods Market', 'Costco Wholesale'], 'Gas Stations': ['Shell', 'Exxon', 'BP', 'Chevron', 'Mobil', '76', 'Arco', 'Texaco'], 'Restaurants': ['McDonald\'s', 'Starbucks', 'Subway', 'Pizza Hut', 'KFC', 'Taco Bell', 'Chipotle', 'Panera Bread'], 'Retail Stores': ['Amazon', 'Best Buy', 'Home Depot', 'Lowe\'s', 'Macy\'s', 'Target', 'CVS Pharmacy'], 'Online Shopping': ['Amazon.com', 'eBay', 'PayPal', 'Apple Store', 'Google Play', 'Netflix', 'Spotify'], 'ATM Withdrawals': ['Bank of America ATM', 'Chase ATM', 'Wells Fargo ATM', 'Citibank ATM', 'Local Credit Union ATM'], 'Bank Transfers': ['Zelle Transfer', 'Wire Transfer', 'ACH Transfer', 'Online Banking Transfer'], 'Utility Payments': ['Electric Company', 'Gas Company', 'Water Department', 'Internet Provider', 'Cable Company'], 'Insurance Payments': ['Auto Insurance', 'Health Insurance', 'Home Insurance', 'Life Insurance'], 'Healthcare': ['Medical Center', 'Dental Office', 'Pharmacy', 'Urgent Care', 'Specialist Clinic'], 'Entertainment': ['Movie Theater', 'Concert Venue', 'Sports Arena', 'Theme Park', 'Streaming Service'], 'Travel': ['Airport', 'Hotel', 'Rental Car', 'Uber', 'Lyft', 'Airline'], 'Education': ['University Tuition', 'Student Loans', 'Online Course', 'Textbook Store'], 'Professional Services': ['Legal Services', 'Accounting Firm', 'Consulting', 'Real Estate'], 'Home Improvement': ['Home Depot', 'Lowe\'s', 'Hardware Store', 'Contractor Payment'], 'Automotive': ['Auto Dealership', 'Car Repair Shop', 'Auto Parts Store', 'Car Wash'], 'Clothing': ['Department Store', 'Fashion Retailer', 'Shoe Store', 'Online Clothing'], 'Electronics': ['Best Buy', 'Apple Store', 'Electronics Store', 'Computer Store'], 'Pharmacy': ['CVS Pharmacy', 'Walgreens', 'Rite Aid', 'Local Pharmacy'], 'Subscription Services': ['Netflix', 'Spotify', 'Amazon Prime', 'Gym Membership', 'Magazine Subscription'] } # Currencies self.currencies = ['USD', 'EUR', 'GBP', 'CAD', 'AUD', 'JPY', 'CHF', 'CNY'] # Fraud reasons self.fraud_reasons = [ 'Suspicious location', 'Unusual spending pattern', 'Card not present', 'Multiple failed PIN attempts', 'High-risk merchant', 'Velocity check failed', 'Blacklisted merchant', 'Suspicious time of transaction', 'Amount threshold exceeded' ] # Branch locations self.branch_locations = [ 'Downtown Branch', 'Mall Branch', 'Airport Branch', 'University Branch', 'Business District Branch', 'Suburban Branch', 'Highway Branch', 'Shopping Center Branch' ]
[docs] def generate(self) -> List[Dict[str, Any]]: """ Generate banking dataset rows. Returns: List of dictionaries representing banking transaction rows """ if self.seed is not None: random.seed(self.seed) self.faker_utils.set_seed(self.seed) data = [] for i in range(self.rows): row = self._generate_row() data.append(row) return data
def _generate_row(self) -> Dict[str, Any]: """Generate a single banking transaction row.""" # Generate transaction date (within last year) end_date = datetime.now() start_date = end_date - timedelta(days=365) transaction_date = self.faker_utils.date_between(start_date, end_date) # Generate account information account_type = random.choice(self.account_types) transaction_type = random.choice(self.transaction_types) # Generate transaction amount based on type and account transaction_amount = self._generate_transaction_amount(transaction_type, account_type) # Generate merchant information (not all transactions have merchants) merchant_name = None merchant_category = None if transaction_type in ['Payment', 'Withdrawal'] and random.random() > 0.3: merchant_category = random.choice(self.merchant_categories) if merchant_category in self.merchants: merchant_name = random.choice(self.merchants[merchant_category]) # Generate balances balance_before = random.uniform(100, 50000) if transaction_type in ['Deposit', 'Interest', 'Refund']: balance_after = balance_before + transaction_amount else: balance_after = balance_before - transaction_amount # Ensure balance doesn't go negative (with some overdraft allowance) if balance_after < -500: balance_after = random.uniform(-500, 0) balance_before = balance_after + transaction_amount # Generate location information country = self.faker_utils.country() city = self.faker_utils.city() postal_code = self.faker_utils.postal_code() # Generate fraud detection fraud_flag = self._determine_fraud_flag(transaction_amount, merchant_category, transaction_type) fraud_reason = None if fraud_flag: fraud_reason = random.choice(self.fraud_reasons) # Generate branch information branch_location = random.choice(self.branch_locations) # Currency (mostly local currency with some international) currency = random.choice(self.currencies) if random.random() < 0.1 else 'USD' # Notes (occasionally present) notes = None if random.random() < 0.15: note_options = [ 'Customer initiated transfer', 'Automatic bill payment', 'Mobile deposit', 'ATM transaction', 'Online banking transaction', 'Branch transaction', 'Phone banking transaction', 'Recurring payment', 'One-time payment' ] notes = random.choice(note_options) return { 'transaction_id': self._generate_transaction_id(), 'account_id': self._generate_account_id(), 'customer_id': self._generate_customer_id(), 'account_type': account_type, 'transaction_date': transaction_date.strftime('%Y-%m-%d %H:%M:%S'), 'transaction_type': transaction_type, 'transaction_amount': round(transaction_amount, 2), 'currency': currency, 'merchant_name': merchant_name, 'merchant_category': merchant_category, 'balance_before': round(balance_before, 2), 'balance_after': round(balance_after, 2), 'branch_id': self._generate_branch_id(), 'branch_location': branch_location, 'fraud_flag': fraud_flag, 'fraud_reason': fraud_reason, 'country': country, 'city': city, 'postal_code': postal_code, 'notes': notes } def _generate_transaction_id(self) -> str: """Generate transaction ID in format TXN-YYYYMMDD-NNNNNN.""" date_part = datetime.now().strftime('%Y%m%d') number_part = str(self._transaction_counter).zfill(6) self._transaction_counter += 1 return f"TXN-{date_part}-{number_part}" def _generate_account_id(self) -> str: """Generate account ID in format ACC-NNNNNNN.""" account_num = str(self._account_counter).zfill(7) self._account_counter += 1 return f"ACC-{account_num}" def _generate_customer_id(self) -> str: """Generate customer ID in format CUST-NNNNNN.""" customer_num = str(self._customer_counter).zfill(6) self._customer_counter += 1 return f"CUST-{customer_num}" def _generate_branch_id(self) -> str: """Generate branch ID in format BR-NNN.""" branch_num = str(random.randint(1, 999)).zfill(3) return f"BR-{branch_num}" def _generate_transaction_amount(self, transaction_type: str, account_type: str) -> float: """ Generate realistic transaction amount based on type and account. Args: transaction_type: Type of transaction account_type: Type of account Returns: Transaction amount """ base_ranges = { 'Deposit': (50, 5000), 'Withdrawal': (20, 1000), 'Transfer': (100, 10000), 'Payment': (15, 2000), 'Fee': (5, 50), 'Interest': (1, 100), 'Refund': (10, 500) } # Adjust ranges based on account type if account_type == 'Business': multiplier = random.uniform(2, 10) elif account_type == 'Savings': multiplier = random.uniform(1.5, 3) else: multiplier = 1 min_amount, max_amount = base_ranges.get(transaction_type, (10, 1000)) amount = random.uniform(min_amount, max_amount) * multiplier return amount def _determine_fraud_flag(self, amount: float, merchant_category: str, transaction_type: str) -> bool: """ Determine if transaction should be flagged as fraudulent. Args: amount: Transaction amount merchant_category: Merchant category transaction_type: Transaction type Returns: True if flagged as fraud, False otherwise """ fraud_probability = 0.02 # Base 2% fraud rate # Increase fraud probability for high amounts if amount > 5000: fraud_probability += 0.05 elif amount > 2000: fraud_probability += 0.02 # Increase fraud probability for certain categories high_risk_categories = ['Online Shopping', 'Electronics', 'Travel', 'Entertainment'] if merchant_category in high_risk_categories: fraud_probability += 0.03 # Increase fraud probability for withdrawals if transaction_type == 'Withdrawal': fraud_probability += 0.01 return random.random() < fraud_probability
[docs] def get_schema(self) -> Dict[str, str]: """ Return column schema with types. Returns: Dictionary mapping column names to their data types """ return { 'transaction_id': 'string', 'account_id': 'string', 'customer_id': 'string', 'account_type': 'string', 'transaction_date': 'datetime', 'transaction_type': 'string', 'transaction_amount': 'float', 'currency': 'string', 'merchant_name': 'string', 'merchant_category': 'string', 'balance_before': 'float', 'balance_after': 'float', 'branch_id': 'string', 'branch_location': 'string', 'fraud_flag': 'boolean', 'fraud_reason': 'string', 'country': 'string', 'city': 'string', 'postal_code': 'string', 'notes': 'string' }