Table Of Contents
- One DataFrame, Infinite Destinations
- Essential Export Formats
- Excel Export with Advanced Features
- JSON Export Options
- Database Export
- Specialized Export Formats
- Customized Export Functions
- Large Dataset Export Strategies
- Export Performance Comparison
- Export Best Practices and Tips
- Master Data Distribution
One DataFrame, Infinite Destinations
Your analysis is complete, but data sharing requires different formats for different audiences. Master pandas' export arsenal to deliver your insights in exactly the format your stakeholders need.
Essential Export Formats
import pandas as pd
import numpy as np
import json
from datetime import datetime
# Create comprehensive sample data
sample_data = pd.DataFrame({
    'employee_id': [1001, 1002, 1003, 1004, 1005],
    'name': ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'Diana Prince', 'Eve Wilson'],
    'department': ['Engineering', 'Marketing', 'Engineering', 'HR', 'Finance'],
    'salary': [85000, 75000, 95000, 65000, 80000],
    'hire_date': pd.to_datetime(['2020-01-15', '2019-06-20', '2021-03-10', '2018-11-05', '2022-02-28']),
    'performance_score': [4.2, 3.8, 4.5, 3.9, 4.1],
    'bonus': [8500, 6000, 12000, 5200, 7500],
    'is_remote': [True, False, True, False, True]
})
print("Sample DataFrame:")
print(sample_data)
print(f"Data types:\n{sample_data.dtypes}")
# CSV Export (most common)
print("\n=== CSV Export ===")
sample_data.to_csv('employees.csv', index=False)
print("✅ Exported to employees.csv")
# CSV with custom settings
sample_data.to_csv('employees_custom.csv',
                   index=False,
                   sep=';',                  # Semicolon separator
                   encoding='utf-8',         # UTF-8 encoding
                   date_format='%Y-%m-%d',   # Date format
                   float_format='%.2f')      # Float precision
print("✅ Exported to employees_custom.csv with custom settings")
Excel Export with Advanced Features
# Excel export with multiple sheets and formatting
print("\n=== Excel Export ===")
# Basic Excel export
sample_data.to_excel('employees.xlsx', index=False, sheet_name='Employee_Data')
print("✅ Basic Excel export completed")
# Advanced Excel export with multiple sheets
with pd.ExcelWriter('comprehensive_report.xlsx', engine='openpyxl') as writer:
    # Main data sheet
    sample_data.to_excel(writer, sheet_name='Employees', index=False)
    # Department summary
    dept_summary = sample_data.groupby('department').agg({
        'salary': ['mean', 'count'],
        'performance_score': 'mean',
        'bonus': 'sum'
    }).round(2)
    dept_summary.to_excel(writer, sheet_name='Department_Summary')
    # Salary analysis
    salary_stats = sample_data['salary'].describe()
    salary_stats.to_frame('Salary_Statistics').to_excel(writer, sheet_name='Salary_Analysis')
print("✅ Multi-sheet Excel report created")
# Excel with conditional formatting (requires openpyxl)
try:
    from openpyxl import load_workbook
    from openpyxl.formatting.rule import ColorScaleRule
    # Load workbook and apply formatting
    wb = load_workbook('comprehensive_report.xlsx')
    ws = wb['Employees']
    # Add conditional formatting to salary column (assuming it's column D)
    salary_range = f'D2:D{len(sample_data) + 1}'
    color_scale = ColorScaleRule(start_type='min', start_color='FF6B6B',
                                 mid_type='percentile', mid_value=50, mid_color='FFEB3B',
                                 end_type='max', end_color='4CAF50')
    ws.conditional_formatting.add(salary_range, color_scale)
    wb.save('formatted_employees.xlsx')
    print("✅ Excel with conditional formatting created")
except ImportError:
    print("⚠️ openpyxl not available for advanced Excel formatting")
JSON Export Options
print("\n=== JSON Export ===")
# Basic JSON export
sample_data.to_json('employees.json', orient='records', indent=2)
print("✅ JSON export (records format) completed")
# Different JSON orientations
json_formats = {
    'records': 'employees_records.json',   # List of records
    'index': 'employees_index.json',       # Index as keys
    'values': 'employees_values.json',     # Just values
    'columns': 'employees_columns.json',   # Columns as keys
    'split': 'employees_split.json'        # Split format
}
for orient, filename in json_formats.items():
    sample_data.to_json(filename, orient=orient, indent=2, date_format='iso')
    print(f"✅ JSON export ({orient} format) to {filename}")
# Custom JSON with nested structure
def create_hierarchical_json(df):
"""Create hierarchical JSON structure"""
hierarchical = {}
for dept in df['department'].unique():
dept_data = df[df['department'] == dept]
hierarchical[dept] = {
'employee_count': len(dept_data),
'average_salary': float(dept_data['salary'].mean()),
'total_bonus': float(dept_data['bonus'].sum()),
'employees': dept_data[['name', 'salary', 'performance_score']].to_dict('records')
}
return hierarchical
hierarchical_data = create_hierarchical_json(sample_data)
with open('hierarchical_employees.json', 'w') as f:
    json.dump(hierarchical_data, f, indent=2, default=str)
print("✅ Hierarchical JSON structure created")
Database Export
print("\n=== Database Export ===")
# SQLite export (built into pandas)
import sqlite3
# Create SQLite database
sample_data.to_sql('employees',
                   con=sqlite3.connect('company.db'),
                   if_exists='replace',
                   index=False)
print("✅ Data exported to SQLite database")
# Read back to verify
conn = sqlite3.connect('company.db')
verification = pd.read_sql('SELECT * FROM employees LIMIT 3', conn)
print("Database verification (first 3 rows):")
print(verification)
conn.close()
# PostgreSQL export example (commented out - requires psycopg2)
"""
from sqlalchemy import create_engine
# Create PostgreSQL connection
engine = create_engine('postgresql://username:password@localhost:5432/database')
# Export to PostgreSQL
sample_data.to_sql('employees',
                   engine,
                   if_exists='replace',
                   index=False,
                   method='multi')  # Faster bulk insert
"""
# MySQL export example (commented out - requires PyMySQL)
"""
import pymysql
from sqlalchemy import create_engine
# Create MySQL connection
engine = create_engine('mysql+pymysql://user:password@localhost/database')
sample_data.to_sql('employees',
                   engine,
                   if_exists='replace',
                   index=False)
"""
Specialized Export Formats
print("\n=== Specialized Export Formats ===")
# Parquet format (efficient for large datasets)
try:
    sample_data.to_parquet('employees.parquet', compression='snappy')
    print("✅ Parquet export completed")
    # Verify parquet file
    parquet_verify = pd.read_parquet('employees.parquet')
    print(f"Parquet verification: {parquet_verify.shape}")
except ImportError:
    print("⚠️ PyArrow/fastparquet not available for Parquet export")
# Pickle format (preserves all pandas features)
sample_data.to_pickle('employees.pkl')
print("✅ Pickle export completed")
# HDF5 format (hierarchical data)
try:
    sample_data.to_hdf('employees.h5', key='data', mode='w', complevel=9)
    print("✅ HDF5 export completed")
except ImportError:
    print("⚠️ PyTables not available for HDF5 export")
# Feather format (fast I/O with R compatibility)
try:
    sample_data.to_feather('employees.feather')
    print("✅ Feather export completed")
except ImportError:
    print("⚠️ PyArrow not available for Feather export")
# ORC format (optimized for big data)
try:
    sample_data.to_orc('employees.orc')
    print("✅ ORC export completed")
except ImportError:
    print("⚠️ PyArrow not available for ORC export")
Customized Export Functions
def export_with_metadata(df, base_filename, formats=['csv', 'excel', 'json']):
"""Export DataFrame with metadata and multiple formats"""
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# Create metadata
metadata = {
'export_timestamp': timestamp,
'row_count': len(df),
'column_count': len(df.columns),
'columns': list(df.columns),
'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
'missing_values': df.isnull().sum().to_dict(),
'memory_usage': df.memory_usage(deep=True).sum(),
'summary_stats': df.describe(include='all').to_dict()
}
# Save metadata
with open(f'{base_filename}_metadata_{timestamp}.json', 'w') as f:
json.dump(metadata, f, indent=2, default=str)
export_log = []
# Export in requested formats
for fmt in formats:
filename = f'{base_filename}_{timestamp}.{fmt}'
try:
if fmt == 'csv':
df.to_csv(filename, index=False)
elif fmt == 'excel':
df.to_excel(filename, index=False)
elif fmt == 'json':
df.to_json(filename, orient='records', indent=2, date_format='iso')
elif fmt == 'parquet':
df.to_parquet(filename)
elif fmt == 'pickle':
df.to_pickle(filename)
export_log.append(f"✅ {filename}")
except Exception as e:
export_log.append(f"❌ {filename}: {str(e)}")
return export_log, metadata
# Test comprehensive export
export_results, export_metadata = export_with_metadata(
    sample_data,
    'employee_export',
    formats=['csv', 'excel', 'json']
)
print("\nComprehensive export results:")
for result in export_results:
    print(result)
print(f"\nExport metadata saved. Total memory usage: {export_metadata['memory_usage']:,} bytes")
Large Dataset Export Strategies
def export_large_dataset(df, filename, format_type='csv', chunk_size=10000):
"""Export large datasets in chunks to manage memory"""
if len(df) <= chunk_size:
# Small dataset - export normally
if format_type == 'csv':
df.to_csv(filename, index=False)
elif format_type == 'json':
df.to_json(filename, orient='records', indent=2)
return f"Small dataset exported directly to {filename}"
# Large dataset - chunked export
print(f"Large dataset detected ({len(df):,} rows). Exporting in chunks...")
if format_type == 'csv':
# CSV chunked export
header_written = False
for start in range(0, len(df), chunk_size):
chunk = df.iloc[start:start + chunk_size]
mode = 'w' if not header_written else 'a'
chunk.to_csv(filename, mode=mode, header=not header_written, index=False)
header_written = True
print(f" Exported rows {start:,} to {start + len(chunk):,}")
elif format_type == 'json':
# JSON chunked export (as JSON Lines)
with open(filename, 'w') as f:
for start in range(0, len(df), chunk_size):
chunk = df.iloc[start:start + chunk_size]
for _, row in chunk.iterrows():
json.dump(row.to_dict(), f, default=str)
f.write('\n')
print(f" Exported rows {start:,} to {start + len(chunk):,}")
return f"Large dataset exported in chunks to {filename}"
# Simulate large dataset
large_data = pd.concat([sample_data] * 2000, ignore_index=True) # 10,000 rows
large_data['row_id'] = range(len(large_data))
print(f"\nLarge dataset created: {len(large_data):,} rows")
# Test chunked export
chunk_result = export_large_dataset(large_data, 'large_employees.csv', 'csv', chunk_size=3000)
print(chunk_result)
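The same chunking idea carries over to Parquet if you drop down to pyarrow's ParquetWriter, which appends one row group per chunk to a single file. A sketch under the assumption that pyarrow is installed (the output filename is arbitrary):
try:
    import pyarrow as pa
    import pyarrow.parquet as pq
    writer = None
    for start in range(0, len(large_data), 3000):
        chunk = large_data.iloc[start:start + 3000]
        table = pa.Table.from_pandas(chunk, preserve_index=False)
        if writer is None:
            # The first chunk fixes the schema for the whole file
            writer = pq.ParquetWriter('large_employees.parquet', table.schema)
        writer.write_table(table)   # Each call appends a row group
    if writer is not None:
        writer.close()
    print("✅ Chunked Parquet export completed")
except ImportError:
    print("⚠️ PyArrow not available for chunked Parquet export")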
Export Performance Comparison
import time
import os
def compare_export_performance(df, formats=['csv', 'excel', 'json', 'pickle']):
"""Compare export performance across formats"""
results = {}
for fmt in formats:
filename = f'performance_test.{fmt}'
try:
start_time = time.time()
if fmt == 'csv':
df.to_csv(filename, index=False)
elif fmt == 'excel':
df.to_excel(filename, index=False)
elif fmt == 'json':
df.to_json(filename, orient='records')
elif fmt == 'pickle':
df.to_pickle(filename)
elif fmt == 'parquet':
df.to_parquet(filename)
export_time = time.time() - start_time
file_size = os.path.getsize(filename) / 1024 # KB
results[fmt] = {
'export_time': export_time,
'file_size_kb': file_size,
'speed_kb_per_sec': file_size / export_time if export_time > 0 else 0
}
# Clean up
os.remove(filename)
except Exception as e:
results[fmt] = {'error': str(e)}
return results
# Performance test
performance_results = compare_export_performance(large_data)
print("\n=== Export Performance Comparison ===")
print(f"Dataset size: {len(large_data):,} rows, {large_data.memory_usage(deep=True).sum()/1024:.1f} KB in memory")
performance_df = pd.DataFrame(performance_results).T
if 'error' not in performance_df.columns:
    performance_df = performance_df.sort_values('export_time')
    print("\nPerformance results (sorted by speed):")
    print(performance_df.round(3))
else:
    print("\nPerformance results:")
    print(performance_df)
Export Best Practices and Tips
def export_best_practices():
"""Best practices for DataFrame exports"""
practices = {
'Format Selection': [
"CSV: Universal compatibility, human-readable",
"Excel: Business users, multiple sheets, formatting",
"JSON: Web APIs, nested structures, JavaScript",
"Parquet: Big data, compression, schema preservation",
"Pickle: Python-only, preserves exact DataFrame state"
],
'Performance Optimization': [
"Use chunking for large datasets (>100MB)",
"Choose appropriate compression (gzip, snappy)",
"Consider columnar formats (Parquet, ORC) for analytics",
"Use categorical data types before export",
"Remove unnecessary columns before export"
],
'Data Integrity': [
"Specify date formats explicitly",
"Handle missing values appropriately",
"Preserve data types with metadata",
"Use UTF-8 encoding for international characters",
"Validate exports with sample reads"
],
'Collaboration': [
"Include metadata and documentation",
"Use consistent naming conventions",
"Add timestamps to avoid overwrites",
"Consider your audience's technical capabilities",
"Provide data dictionaries for complex datasets"
]
}
print("=== EXPORT BEST PRACTICES ===")
for category, tips in practices.items():
print(f"\n{category}:")
for tip in tips:
print(f" ✅ {tip}")
export_best_practices()
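One tip from the list above, converting repeated strings to the categorical dtype before a binary export, is cheap to apply and often shrinks the output noticeably because each distinct value is stored only once. A small illustration on the large dataset (file names are arbitrary):
optimized = large_data.copy()
optimized['department'] = optimized['department'].astype('category')
optimized['name'] = optimized['name'].astype('category')
optimized.to_pickle('employees_categorical.pkl')
large_data.to_pickle('employees_plain.pkl')
print(f"Plain pickle:       {os.path.getsize('employees_plain.pkl') / 1024:.1f} KB")
print(f"Categorical pickle: {os.path.getsize('employees_categorical.pkl') / 1024:.1f} KB")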
# Export validation function
def validate_export(original_df, exported_file, file_format='csv'):
"""Validate that export preserved data integrity"""
try:
if file_format == 'csv':
imported_df = pd.read_csv(exported_file)
elif file_format == 'excel':
imported_df = pd.read_excel(exported_file)
elif file_format == 'json':
imported_df = pd.read_json(exported_file)
elif file_format == 'pickle':
imported_df = pd.read_pickle(exported_file)
elif file_format == 'parquet':
imported_df = pd.read_parquet(exported_file)
validation_results = {
'shape_match': original_df.shape == imported_df.shape,
'columns_match': list(original_df.columns) == list(imported_df.columns),
'row_count_match': len(original_df) == len(imported_df),
'data_types_preserved': original_df.dtypes.equals(imported_df.dtypes)
}
# Check data integrity (for numeric columns)
numeric_cols = original_df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
numeric_match = True
for col in numeric_cols:
if not np.allclose(original_df[col].fillna(0), imported_df[col].fillna(0), equal_nan=True):
numeric_match = False
break
validation_results['numeric_data_match'] = numeric_match
return validation_results
except Exception as e:
return {'error': str(e)}
# Test export validation
sample_data.to_csv('validation_test.csv', index=False)
validation_result = validate_export(sample_data, 'validation_test.csv', 'csv')
print(f"\nExport validation result:")
for check, result in validation_result.items():
status = "✅" if result else "❌"
print(f"{status} {check}: {result}")
Master Data Distribution
Explore cloud storage integration patterns, learn automated reporting systems, and discover data pipeline automation strategies.