A Failed Experiment: Using OpenCV to Read Printed Dates from Scanned Photos
I recently purchased a scanner to digitize old photos that have dates printed on them, and I wanted to see whether I could read the shooting date automatically from the scanned images.
Organizing Old Photographs
At the end of the year I purchased a scanner to digitize old photographs. The photographs have dates printed on them, and I’ve been manually setting these dates in the EXIF information using ExifTool.
I use Hazel to organize image files, and I’ve already created a tool that sets each file’s creation date (which Hazel can read) to the shooting date stored in its EXIF data. However, since setting EXIF information manually is time-consuming, I explored the possibility of automating this process using OpenCV.
Date Reading Tool
Below is a script that uses OpenCV to isolate the printed date stamp, Tesseract (via pytesseract) to read the date from it, and ExifTool to write that date into the image's EXIF data.
#!/usr/bin/env python3
import os
import re
import subprocess
import sys
from datetime import datetime

import cv2
import numpy as np
import pytesseract
def preprocess_image(image_path, save_debug=True):
    """Isolate the printed date stamp of a scanned photo for OCR.

    The bottom-right corner of the image is cropped, the red channel is
    emphasised against blue/green, and the result is contrast-enhanced,
    thresholded, de-noised, filtered by contour area and upscaled 4x.

    Args:
        image_path: Path of the image file to load.
        save_debug: When true (the default), the intermediate images are
            written to a ``debug`` directory next to the input image.

    Returns:
        A single-channel binary image (numpy array) containing only the
        contours kept by the area filter, enlarged by a factor of four.

    Raises:
        ValueError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than on img.shape below.
        raise ValueError(f"Could not read image: {image_path}")
    height, width = img.shape[:2]

    # Crop the bottom-right region (right 35% of the width, bottom 25% of
    # the height), where the date stamp is expected.
    x1 = int(width * 0.65)
    y1 = int(height * 0.75)
    cropped = img[y1:height, x1:width]

    # Emphasise red: subtract the larger of blue/green from the red channel.
    # NOTE(review): presumably the stamp is printed in a reddish colour.
    b, g, r = cv2.split(cropped)
    red_emphasis = cv2.subtract(r, cv2.max(b, g))

    # Enhance local contrast with CLAHE, then apply a fixed threshold.
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(red_emphasis)
    _, binary = cv2.threshold(enhanced, 30, 255, cv2.THRESH_BINARY)

    # Morphological opening with a 2x2 kernel removes isolated specks.
    kernel = np.ones((2, 2), np.uint8)
    cleaned = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel)

    # Keep only external contours whose area lies strictly between the
    # bounds, and paint them filled onto an empty mask.
    contours, _ = cv2.findContours(
        cleaned, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    mask = np.zeros_like(cleaned)
    min_area = 10
    max_area = 500
    for contour in contours:
        if min_area < cv2.contourArea(contour) < max_area:
            cv2.drawContours(mask, [contour], -1, 255, -1)

    # Upscale 4x before handing the image to OCR.
    result = cv2.resize(mask, None, fx=4, fy=4, interpolation=cv2.INTER_CUBIC)

    if save_debug:
        _save_debug_images(
            image_path,
            [
                ('cropped', cropped),
                ('red_emphasis', red_emphasis),
                ('enhanced', enhanced),
                ('binary', binary),
                ('result', result),
            ],
        )
    return result


def _save_debug_images(image_path, stages):
    """Write each (label, image) pair to a ``debug`` folder beside image_path."""
    debug_dir = os.path.join(os.path.dirname(image_path), 'debug')
    os.makedirs(debug_dir, exist_ok=True)
    basename = os.path.basename(image_path)
    for label, image in stages:
        cv2.imwrite(os.path.join(debug_dir, f'{label}_{basename}'), image)
# Two-digit year, month and day, optionally preceded by an apostrophe and
# separated by optional whitespace (e.g. "'98 3 12").  Compiled once here;
# requires the module-level ``import re``.
_STAMP_DATE_RE = re.compile(r"'?(\d{2})\s*(\d{1,2})\s*(\d{1,2})")


def extract_date_from_image(image_path):
    """Extract the printed date from an image using OCR.

    The image is preprocessed with ``preprocess_image`` and passed to
    Tesseract with several page segmentation modes in turn; the first OCR
    result that parses as a valid date wins.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A ``datetime`` for the recognised date, or ``None`` if no OCR
        attempt produced a valid date.
    """
    processed_image = preprocess_image(image_path)

    # Restrict recognition to digits, apostrophe and space.
    whitelist = '-c tessedit_char_whitelist="0123456789\' "'
    # Page segmentation modes tried in order: 6 = uniform text block,
    # 7 = single text line, 8 = single word, 10 = single character.
    for psm in (6, 7, 8, 10):
        config = f'--psm {psm} {whitelist}'
        text = pytesseract.image_to_string(processed_image, config=config)
        text = text.strip()
        print(f"OCR result (config={config}): [{text}]")

        date = _parse_stamp_date(text)
        if date is not None:
            return date
    return None


def _parse_stamp_date(text):
    """Parse the first "YY M D" group in OCR text; return None if invalid."""
    # Collapse runs of whitespace (including newlines) to single spaces.
    normalized = ' '.join(text.split())
    match = _STAMP_DATE_RE.search(normalized)
    if match is None:
        return None

    year, month, day = (int(group) for group in match.groups())
    # Two-digit years below 50 are read as 20xx, the rest as 19xx.
    year = 2000 + year if year < 50 else 1900 + year
    try:
        # datetime() rejects out-of-range months/days and impossible
        # calendar dates such as February 30.
        return datetime(year, month, day)
    except ValueError:
        return None
def set_exif_date(image_path, date):
    """Set the EXIF date tags of an image using ExifTool.

    Writes ``DateTimeOriginal``, ``CreateDate`` and ``ModifyDate`` in place
    by running the ``exiftool`` command-line program.

    Args:
        image_path: Path of the image file to modify.
        date: ``datetime`` to write, or a falsy value (e.g. ``None``) when
            no date was found.

    Returns:
        ``True`` if ExifTool ran successfully, ``False`` if no date was
        given, ExifTool is not installed, or ExifTool reported an error.
    """
    if not date:
        print(f"No date found in: {image_path}")
        return False

    date_str = date.strftime("%Y:%m:%d")
    # EXIF date/time tags are defined as "YYYY:MM:DD HH:MM:SS", so write a
    # complete timestamp instead of a date-only value.  Dates built from
    # year/month/day alone yield midnight here.
    timestamp = date.strftime("%Y:%m:%d %H:%M:%S")

    # EXIF tags to set
    exif_tags = (
        "DateTimeOriginal",
        "CreateDate",
        "ModifyDate",
    )
    command = ["exiftool", "-overwrite_original_in_place"]
    command.extend(f"-{tag}={timestamp}" for tag in exif_tags)
    command.append(image_path)

    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except FileNotFoundError:
        # Raised by subprocess when the exiftool executable is not on PATH.
        print("Failed to set EXIF information: exiftool executable not found")
        return False
    except subprocess.CalledProcessError as e:
        # Output is captured, so surface ExifTool's own error text.
        details = (e.stderr or "").strip()
        suffix = f" ({details})" if details else ""
        print(f"Failed to set EXIF information: {e}{suffix}")
        return False

    print(f"Date set successfully: {image_path} -> {date_str}")
    return True
def process_images(directory):
    """Process every image file directly inside the given directory.

    For each file whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (case-insensitive), the date is extracted with
    ``extract_date_from_image`` and written with ``set_exif_date``.  A
    summary of successes and failures is printed at the end.

    Args:
        directory: Path of the directory to scan (not recursive).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    processed = 0
    failed = 0

    # os.listdir returns names in arbitrary order; sort for reproducible runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(image_extensions):
            continue

        image_path = os.path.join(directory, filename)
        try:
            date = extract_date_from_image(image_path)
            succeeded = set_exif_date(image_path, date)
        except Exception as exc:
            # Batch boundary: report the problem and keep going so that one
            # unreadable or problematic file does not abort the whole run.
            print(f"Error processing {image_path}: {exc}")
            succeeded = False

        if succeeded:
            processed += 1
        else:
            failed += 1

    print("\nProcessing complete:")
    print(f"Success: {processed}")
    print(f"Failed: {failed}")
def main():
    """Command-line entry point: process the directory named in argv.

    Expects exactly one argument, the image directory.  Prints a message
    and exits with status 1 when the argument count is wrong or the path
    is not a directory; otherwise hands the directory to process_images.
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print("Usage: photodate <image_directory>")
        sys.exit(1)

    (target_dir,) = arguments
    if os.path.isdir(target_dir):
        process_images(target_dir)
    else:
        print("Specified path is not a valid directory")
        sys.exit(1)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Results
I tested the script on 27 scanned photographs.
The script detected a date in only 4 of the 27 images, and only 1 of those 4 dates was recognized correctly.
Well, this isn’t practical enough to use.