SHA256 is faster than MD5

#performance#python#cryptography

For years I’ve used MD5 for file integrity checks in test scripts, assuming it was faster because it is the simpler algorithm. Since modern hardware now ships with built-in cryptographic instructions, I decided to test whether that assumption still holds.

I benchmarked MD5 vs SHA256 across different input sizes using Python’s hashlib on a MacBook M2 Max:

Small strings (12 characters):

Large strings (1KB):

Files (1MB/50MB):

SHA256 consistently outperformed MD5 across all test cases. The main reason is that modern CPUs accelerate SHA256 in hardware (Intel SHA extensions, ARM cryptographic extensions, etc.) and libraries like Python’s hashlib use those optimized implementations — while MD5 has no equivalent hardware support.

The bonus: SHA256 also provides significantly better security than MD5, which has known vulnerabilities.

Benchmark code for reference
import hashlib
import timeit
import os
import random
import string

def generate_random_string(length):
    """Return a random alphanumeric string of the given length."""
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choice(alphabet) for _ in range(length))

def create_test_file(size_mb):
    """Write a file of `size_mb` MB of random bytes and return its name.

    The caller is responsible for deleting the file when done.
    """
    filename = f"test_{size_mb}mb.txt"
    payload = os.urandom(size_mb * 1024 * 1024)
    with open(filename, 'wb') as f:
        f.write(payload)
    return filename

def hash_md5(data):
    """Return the hex-encoded MD5 digest of `data` (bytes)."""
    digest = hashlib.md5(data)
    return digest.hexdigest()

def hash_sha256(data):
    """Return the hex-encoded SHA-256 digest of `data` (bytes)."""
    digest = hashlib.sha256(data)
    return digest.hexdigest()

def benchmark_string_hashing():
    """Time MD5 vs SHA256 on a small (12-char) and a large (1KB) string.

    Each case prints a header followed by the total wall time for the
    given number of hashing iterations for each algorithm.
    """
    cases = [
        ("\nSmall String (12 characters):", 12, 100000),
        ("\nLarge String (1KB):", 1024, 1000),
    ]
    for header, length, iterations in cases:
        payload = generate_random_string(length).encode()
        print(header)
        md5_time = timeit.timeit(lambda: hash_md5(payload), number=iterations)
        print(f"MD5: {md5_time:.4f} seconds")
        sha_time = timeit.timeit(lambda: hash_sha256(payload), number=iterations)
        print(f"SHA256: {sha_time:.4f} seconds")

def benchmark_file_hashing():
    """Benchmark MD5 vs SHA256 on 1MB and 50MB files of random data.

    For each file size, the whole file is read into memory once and each
    algorithm is timed over repeated hashes of that buffer (so disk I/O
    is excluded from the measurement).

    Cleanup is tracked per created file: previously the 1MB file was
    created before the try block, so a failure while creating the 50MB
    file would leak it. Now every file that was actually created is
    removed in the finally clause.
    """
    created = []
    try:
        file_1mb = create_test_file(1)
        created.append(file_1mb)
        file_50mb = create_test_file(50)
        created.append(file_50mb)

        # (header, filename, timeit iteration count)
        for header, filename, number in (
            ("\n1MB File:", file_1mb, 100),
            ("\n50MB File:", file_50mb, 10),
        ):
            print(header)
            with open(filename, 'rb') as f:
                data = f.read()
            print(f"MD5: {timeit.timeit(lambda: hash_md5(data), number=number):.4f} seconds")
            print(f"SHA256: {timeit.timeit(lambda: hash_sha256(data), number=number):.4f} seconds")
    finally:
        # Remove only the files that were successfully created.
        for filename in created:
            os.remove(filename)

if __name__ == "__main__":
    print("Benchmarking Hash Algorithms")
    print("=" * 30)
    
    benchmark_string_hashing()
    benchmark_file_hashing()
Output
Benchmarking Hash Algorithms
==============================

Small String (12 characters):
MD5: 0.0525 seconds
SHA256: 0.0376 seconds

Large String (1KB):
MD5: 0.0019 seconds
SHA256: 0.0007 seconds

1MB File:
MD5: 0.1538 seconds
SHA256: 0.0407 seconds

50MB File:
MD5: 0.7612 seconds
SHA256: 0.2083 seconds

Old assumptions about algorithm complexity don’t always translate to real-world performance on modern hardware.

Top