Building a Korean Character Recognition Model with PyTorch
Hangul, the Korean writing system, is both elegant and systematic. Each character is a combination of components called Jamo: 초성 (initial), 중성 (medial), and 종성 (final). In this project, I built a deep learning model using PyTorch to recognize these components from rendered character images. Here's how I did it.
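For example, the syllable 한 breaks down into ㅎ + ㅏ + ㄴ, and every one of the 11,172 modern syllables follows the same Unicode arithmetic. A minimal illustration of the formula that the decompose_hangul function in Step 1 relies on:

# Every modern syllable is encoded as 0xAC00 + (cho * 21 + jung) * 28 + jong
code = ord('한') - 0xAC00          # offset from '가', the first syllable
cho, rest = divmod(code, 21 * 28)  # 21 medial vowels x 28 finals per initial
jung, jong = divmod(rest, 28)
print(cho, jung, jong)             # 18, 0, 4  ->  ㅎ, ㅏ, ㄴ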
✨ Project Overview
This project involves:
- Generating synthetic images of Hangul characters with their bounding boxes.
- Decomposing characters into Jamo components.
- Training a convolutional neural network to classify each character's 초성, 중성, and 종성.
- Performing inference on single or multiple images and visualizing the results.
Let’s break it down.
Step 1: Generate Hangul Character Dataset
We use a TrueType font (like Malgun Gothic) to render images of Hangul syllables and compute their bounding boxes. Each image is saved along with Jamo annotations in JSON.
generate_korean_chars.py
from PIL import Image, ImageDraw, ImageFont
import os
import json
from typing import Tuple, Dict
import numpy as np


def find_char_bounds(img_array):
    """Find the actual character boundaries in the image"""
    # Convert RGB to grayscale and then to binary (black and white)
    if len(img_array.shape) == 3:
        gray = np.mean(img_array, axis=2)
    else:
        gray = img_array
    # Find non-white pixels (assuming white background)
    y_nonzero, x_nonzero = np.nonzero(gray < 245)  # Allow some tolerance for anti-aliasing
    if len(x_nonzero) == 0 or len(y_nonzero) == 0:
        return None
    # Get bounds
    left = int(np.min(x_nonzero))
    right = int(np.max(x_nonzero))
    top = int(np.min(y_nonzero))
    bottom = int(np.max(y_nonzero))
    return left, top, right, bottom


def decompose_hangul(char: str) -> Tuple[str, str, str]:
    """Decompose a Hangul character into its Jamo components (초성, 중성, 종성)"""
    if len(char) != 1:
        raise ValueError("Input must be a single character")
    # Unicode values for Hangul characters
    HANGUL_BASE = 0xAC00
    HANGUL_END = 0xD7A3
    JAMO_START = 0x1100
    # Jamo arrays
    CHOSUNG = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
    JUNGSUNG = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
    JONGSUNG = [''] + ['ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
    # Check if the character is Hangul
    code = ord(char)
    if code < HANGUL_BASE or code > HANGUL_END:
        raise ValueError("Input must be a Hangul character")
    # Decompose
    relative_code = code - HANGUL_BASE
    cho_idx = relative_code // (21 * 28)
    jung_idx = (relative_code % (21 * 28)) // 28
    jong_idx = relative_code % 28
    return CHOSUNG[cho_idx], JUNGSUNG[jung_idx], JONGSUNG[jong_idx]


def generate_char_image(char: str, font_path: str, size: int = 64,
                        bg_color: str = "white", fg_color: str = "black") -> Tuple[Image.Image, Tuple[int, int, int, int]]:
    """Generate an image of a character using the specified font and return its bounding box"""
    # Create new image with given size
    img = Image.new('RGB', (size, size), bg_color)
    draw = ImageDraw.Draw(img)
    # Load font with smaller size to prevent cutoff
    try:
        # Reduce font size to 70% of image size
        font = ImageFont.truetype(font_path, size=int(size * 0.7))
    except OSError:
        raise ValueError(f"Could not load font from {font_path}")
    # Get character size for centering
    bbox = draw.textbbox((0, 0), char, font=font)
    w = bbox[2] - bbox[0]
    h = bbox[3] - bbox[1]
    # Calculate position to center the character
    x = (size - w) / 2 - bbox[0]  # Adjust for any left-side bearing
    y = (size - h) / 2 - bbox[1]  # Adjust for any top-side bearing
    # Draw character at centered position
    draw.text((x, y), char, font=font, fill=fg_color)
    # Find actual character bounds
    bounds = find_char_bounds(np.array(img))
    if bounds is None:
        bounds = (0, 0, size, size)
    return img, bounds


def save_char_data(char: str, image_path: str, bounds: Tuple[int, int, int, int],
                   output_dir: str, annotations: Dict) -> None:
    """Save character data in COCO format with bounding box"""
    try:
        cho, jung, jong = decompose_hangul(char)
        # Add padding to bounds
        padding = 2
        left, top, right, bottom = bounds
        padded_bounds = [
            max(0, left - padding),
            max(0, top - padding),
            min(64, right + padding),
            min(64, bottom + padding)
        ]
        annotation = {
            "character": char,
            "jamo": {
                "초성": cho,
                "중성": jung,
                "종성": jong
            },
            "bbox": {
                "left": padded_bounds[0],
                "top": padded_bounds[1],
                "right": padded_bounds[2],
                "bottom": padded_bounds[3]
            }
        }
        annotations[image_path] = annotation
    except ValueError as e:
        print(f"Error processing character {char}: {e}")


def main():
    # Configuration
    output_dir = "generated_chars"
    font_path = "C:/Windows/Fonts/malgun.ttf"  # Default Windows Korean font
    image_size = 64
    # Create output directory
    os.makedirs(output_dir, exist_ok=True)
    # Dictionary to store annotations
    annotations = {}
    # Generate basic Hangul syllables
    for code in range(0xAC00, 0xD7A4):  # Range for all possible Hangul syllables
        char = chr(code)
        try:
            # Generate image and get bounding box
            img, bounds = generate_char_image(char, font_path, image_size)
            image_path = os.path.join(output_dir, f"{code:X}.png")
            img.save(image_path)
            # Save annotation with bounding box
            save_char_data(char, image_path, bounds, output_dir, annotations)
            if code % 100 == 0:  # Progress indicator
                print(f"Generated {code - 0xAC00} characters...")
        except Exception as e:
            print(f"Error generating character {char} (code {code:X}): {e}")
    # Save annotations to JSON file
    with open(os.path.join(output_dir, "annotations.json"), "w", encoding="utf-8") as f:
        json.dump(annotations, f, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()
Run it using:
python generate_korean_chars.py
It will create:
- PNG images of all Hangul syllables
- A JSON file (annotations.json) storing character metadata
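The exact bounding-box values depend on the font, so the numbers below are only illustrative, but each entry in annotations.json has this shape (shown here for 가, U+AC00, which has no final consonant):

"generated_chars/AC00.png": {
  "character": "가",
  "jamo": { "초성": "ㄱ", "중성": "ㅏ", "종성": "" },
  "bbox": { "left": 12, "top": 10, "right": 52, "bottom": 50 }
}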
Step 2: Prepare the Dataset and Model
The dataset is loaded using torch.utils.data.Dataset. We normalize the images, convert them to tensors, and extract the target labels for each Jamo component.
korean_model.py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import json
from PIL import Image
import numpy as np
from typing import Dict, List, Tuple

# Constants for Jamo
CHOSUNG = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
JUNGSUNG = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
JONGSUNG = [''] + ['ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']


class KoreanCharDataset(Dataset):
    def __init__(self, data_dir: str, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        # Load annotations
        with open(os.path.join(data_dir, "annotations.json"), "r", encoding="utf-8") as f:
            self.annotations = json.load(f)
        self.image_files = [f for f in os.listdir(data_dir) if f.endswith('.png')]

    def __len__(self) -> int:
        return len(self.image_files)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, Dict[str, int]]:
        img_file = self.image_files[idx]
        img_path = os.path.join(self.data_dir, img_file)
        # Load image
        image = Image.open(img_path).convert('RGB')
        # Get annotations
        annotation = self.annotations[img_path]
        jamo = annotation["jamo"]
        # Convert Jamo to indices
        cho_idx = CHOSUNG.index(jamo["초성"])
        jung_idx = JUNGSUNG.index(jamo["중성"])
        jong_idx = JONGSUNG.index(jamo["종성"])
        # Apply transforms
        if self.transform:
            image = self.transform(image)
        # Create target dictionary
        target = {
            "cho": torch.tensor(cho_idx, dtype=torch.long),
            "jung": torch.tensor(jung_idx, dtype=torch.long),
            "jong": torch.tensor(jong_idx, dtype=torch.long)
        }
        return image, target


class KoreanCharClassifier(nn.Module):
    def __init__(self):
        super(KoreanCharClassifier, self).__init__()
        # CNN Feature Extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        # Separate classifiers for each Jamo component
        self.cho_classifier = nn.Linear(256, len(CHOSUNG))
        self.jung_classifier = nn.Linear(256, len(JUNGSUNG))
        self.jong_classifier = nn.Linear(256, len(JONGSUNG))

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        x = self.features(x)
        x = x.view(x.size(0), -1)
        cho_out = self.cho_classifier(x)
        jung_out = self.jung_classifier(x)
        jong_out = self.jong_classifier(x)
        return cho_out, jung_out, jong_out


def train_model(model: nn.Module, train_loader: DataLoader, val_loader: DataLoader,
                num_epochs: int, device: torch.device) -> None:
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    # Training loop
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for images, targets in train_loader:
            images = images.to(device)
            cho_target = targets["cho"].to(device)
            jung_target = targets["jung"].to(device)
            jong_target = targets["jong"].to(device)
            # Forward pass
            cho_out, jung_out, jong_out = model(images)
            # Calculate loss
            loss = (criterion(cho_out, cho_target) +
                    criterion(jung_out, jung_target) +
                    criterion(jong_out, jong_target))
            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Validation
        model.eval()
        val_loss = 0.0
        correct = {"cho": 0, "jung": 0, "jong": 0}
        total = 0
        with torch.no_grad():
            for images, targets in val_loader:
                images = images.to(device)
                cho_target = targets["cho"].to(device)
                jung_target = targets["jung"].to(device)
                jong_target = targets["jong"].to(device)
                cho_out, jung_out, jong_out = model(images)
                # Calculate validation loss
                loss = (criterion(cho_out, cho_target) +
                        criterion(jung_out, jung_target) +
                        criterion(jong_out, jong_target))
                val_loss += loss.item()
                # Calculate accuracy
                _, cho_pred = torch.max(cho_out, 1)
                _, jung_pred = torch.max(jung_out, 1)
                _, jong_pred = torch.max(jong_out, 1)
                total += cho_target.size(0)
                correct["cho"] += (cho_pred == cho_target).sum().item()
                correct["jung"] += (jung_pred == jung_target).sum().item()
                correct["jong"] += (jong_pred == jong_target).sum().item()
        # Print epoch statistics
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {train_loss/len(train_loader):.4f}')
        print(f'Val Loss: {val_loss/len(val_loader):.4f}')
        print(f'Accuracies: 초성={correct["cho"]/total:.2%}, '
              f'중성={correct["jung"]/total:.2%}, '
              f'종성={correct["jong"]/total:.2%}')
        print('-' * 60)


def main():
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    # Data transforms
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    # Create dataset
    dataset = KoreanCharDataset("generated_chars", transform=transform)
    # Split dataset
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    # Create model
    model = KoreanCharClassifier().to(device)
    # Train model
    train_model(model, train_loader, val_loader, num_epochs=10, device=device)
    # Save model
    torch.save(model.state_dict(), "korean_classifier.pth")
    print("Model saved successfully!")


if __name__ == "__main__":
    main()
This file defines:
- KoreanCharDataset: for loading images and labels
- KoreanCharClassifier: a CNN with three output heads for 초성, 중성, and 종성 (a quick shape check follows after this list)
- train_model: trains and evaluates the model over epochs
To run the training:
python korean_model.py
This will:
- Split the dataset into training and validation sets
- Train the model and print accuracy for each Jamo
- Save the model to korean_classifier.pth
Step 3: Predict and Visualize
The inference.py script allows prediction on either a single image or a batch of randomly selected characters.
inference.py
import torch
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms
from korean_model import KoreanCharClassifier, CHOSUNG, JUNGSUNG, JONGSUNG
import matplotlib.font_manager as fm
import os
import random


def load_model(model_path: str):
    """Load the trained model"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = KoreanCharClassifier().to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model, device


def process_image(image_path: str, device: torch.device):
    """Load and preprocess the image"""
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
    ])
    image = Image.open(image_path).convert('RGB')
    image_tensor = transform(image).unsqueeze(0).to(device)
    return image, image_tensor


def predict_single_image(model, device, image_path: str):
    """Make prediction for a single image"""
    image, image_tensor = process_image(image_path, device)
    with torch.no_grad():
        cho_out, jung_out, jong_out = model(image_tensor)
        # Get predicted indices
        _, cho_pred = torch.max(cho_out, 1)
        _, jung_pred = torch.max(jung_out, 1)
        _, jong_pred = torch.max(jong_out, 1)
    # Get predicted Jamo characters
    cho = CHOSUNG[cho_pred.item()]
    jung = JUNGSUNG[jung_pred.item()]
    jong = JONGSUNG[jong_pred.item()]
    return image, cho, jung, jong


def predict_and_display(model_path: str, image_path: str = None):
    """Load model, make prediction, and display results"""
    # Load model
    model, device = load_model(model_path)
    # Set up Korean font
    korean_fonts = [f for f in fm.findSystemFonts() if 'malgun' in f.lower() or 'gulim' in f.lower()]
    if korean_fonts:
        plt.rcParams['font.family'] = fm.FontProperties(fname=korean_fonts[0]).get_name()

    if image_path and os.path.isfile(image_path):
        # Single image mode
        image, cho, jung, jong = predict_single_image(model, device, image_path)
        # Create visualization
        fig = plt.figure(figsize=(12, 5))
        gs = fig.add_gridspec(1, 2, width_ratios=[1, 1])
        # Left subplot - Original image
        ax1 = fig.add_subplot(gs[0])
        ax1.imshow(image, cmap='gray')
        ax1.set_title('Input Character', pad=20, fontsize=14)
        ax1.axis('off')
        # Right subplot - Predictions
        ax2 = fig.add_subplot(gs[1])
        ax2.set_title('Predicted Jamo Components', pad=20, fontsize=14)
        # Create a table for predictions
        cell_text = [[cho], [jung], [jong]]
        row_labels = ['초성', '중성', '종성']
        table = ax2.table(cellText=cell_text, rowLabels=row_labels,
                          colWidths=[0.5], loc='center', cellLoc='center')
        # Style the table
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.5, 2)
        ax2.axis('off')
    else:
        # Batch mode - process 20 random images from generated_chars
        image_dir = "generated_chars"
        if not os.path.isdir(image_dir):
            print(f"Error: Directory '{image_dir}' not found!")
            return
        # Get list of PNG files
        image_files = [f for f in os.listdir(image_dir) if f.endswith('.png')]
        if len(image_files) > 20:
            image_files = random.sample(image_files, 20)
        # Calculate grid dimensions
        n_images = len(image_files)
        n_cols = 5
        n_rows = (n_images + n_cols - 1) // n_cols
        # Create figure
        fig = plt.figure(figsize=(20, 4 * n_rows))
        # Process each image
        for idx, img_file in enumerate(image_files):
            img_path = os.path.join(image_dir, img_file)
            image, cho, jung, jong = predict_single_image(model, device, img_path)
            # Create subplot
            ax = fig.add_subplot(n_rows, n_cols, idx + 1)
            ax.imshow(image, cmap='gray')
            ax.set_title(f'Character: {img_file[:-4]}\n초성: {cho}, 중성: {jung}, 종성: {jong}',
                         fontsize=10, pad=10)
            ax.axis('off')
            # Print predictions
            print(f"\nPredictions for {img_file}:")
            print(f"초성: {cho}")
            print(f"중성: {jung}")
            print(f"종성: {jong}")
            print("-" * 30)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    import sys
    if len(sys.argv) == 2:
        # Only model path provided - do batch processing
        model_path = sys.argv[1]
        predict_and_display(model_path)
    elif len(sys.argv) == 3:
        # Both model path and image path provided
        model_path = sys.argv[1]
        image_path = sys.argv[2]
        predict_and_display(model_path, image_path)
    else:
        print("Usage:")
        print("1. Single image: python inference.py <model_path> <image_path>")
        print("2. Batch processing: python inference.py <model_path>")
        print("Example:")
        print("1. python inference.py korean_classifier.pth generated_chars/AC00.png")
        print("2. python inference.py korean_classifier.pth")
        sys.exit(1)
Run Inference
- Predict a single image: python inference.py korean_classifier.pth generated_chars/AC00.png
- Predict a batch of 20 random images: python inference.py korean_classifier.pth
You'll see the predictions in the terminal and also get a matplotlib visualization showing the input and the predicted 초성, 중성, and 종성.
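If you want to collapse the three predicted Jamo back into a single printable syllable, the Unicode arithmetic from Step 1 inverts cleanly. A minimal sketch (compose_hangul is a hypothetical helper, not part of inference.py; cho, jung, and jong are the strings returned by predict_single_image):

from korean_model import CHOSUNG, JUNGSUNG, JONGSUNG

def compose_hangul(cho: str, jung: str, jong: str) -> str:
    """Recombine predicted Jamo into one syllable (inverse of decompose_hangul)."""
    cho_idx = CHOSUNG.index(cho)
    jung_idx = JUNGSUNG.index(jung)
    jong_idx = JONGSUNG.index(jong)
    return chr(0xAC00 + (cho_idx * 21 + jung_idx) * 28 + jong_idx)

print(compose_hangul('ㅎ', 'ㅏ', 'ㄴ'))  # 한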
Results
The model performs quite well, with separate accuracy metrics printed after each epoch. Thanks to the systematic structure of Hangul and the clean synthetic data, the model generalizes effectively to the full range of characters.
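The per-component numbers can hide characters where just one of the three heads is wrong. For a stricter measure, a small helper can report full-syllable accuracy, i.e. the fraction of characters with all three Jamo predicted correctly. A sketch, assuming the model, val_loader, and device built in korean_model.py (syllable_accuracy itself is a hypothetical addition):

import torch

def syllable_accuracy(model, loader, device):
    """Fraction of characters where 초성, 중성, and 종성 are all predicted correctly."""
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for images, targets in loader:
            cho_out, jung_out, jong_out = model(images.to(device))
            cho_ok = cho_out.argmax(1).cpu() == targets["cho"]
            jung_ok = jung_out.argmax(1).cpu() == targets["jung"]
            jong_ok = jong_out.argmax(1).cpu() == targets["jong"]
            correct += (cho_ok & jung_ok & jong_ok).sum().item()
            total += images.size(0)
    return correct / total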
Tools & Libraries
- Python 3.8+
- PyTorch
- PIL (Pillow)
- Matplotlib
- NumPy
Folder Structure
project/
├── generate_korean_chars.py
├── korean_model.py
├── inference.py
├── generated_chars/
│   ├── AC00.png
│   ├── ...
│   └── annotations.json
└── korean_classifier.pth
Final Thoughts
This project showcases how we can combine Korean linguistic structure with deep learning to build a character-level recognition model. It can be extended further by:
- Adding handwritten data
- Using pre-trained backbones (sketched below)
- Incorporating OCR pipelines
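For the pre-trained backbone idea, one possible direction (a sketch only, not something I trained for this post) is to swap the small CNN for a torchvision ResNet-18 while keeping the same three Jamo heads; the inputs would then ideally be normalized with ImageNet statistics instead of 0.5/0.5:

import torch.nn as nn
from torchvision import models
from korean_model import CHOSUNG, JUNGSUNG, JONGSUNG

class ResNetJamoClassifier(nn.Module):
    """Hypothetical variant: ImageNet-pretrained ResNet-18 trunk, three Jamo heads."""
    def __init__(self):
        super().__init__()
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        backbone.fc = nn.Identity()  # expose the 512-d pooled features
        self.features = backbone
        self.cho_classifier = nn.Linear(512, len(CHOSUNG))
        self.jung_classifier = nn.Linear(512, len(JUNGSUNG))
        self.jong_classifier = nn.Linear(512, len(JONGSUNG))

    def forward(self, x):
        feats = self.features(x)
        return (self.cho_classifier(feats),
                self.jung_classifier(feats),
                self.jong_classifier(feats))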
If you found this helpful or want to collaborate on multilingual AI or OCR projects, feel free to connect!
Happy coding!