Normalize converter labels and fix card grid sizing

This commit is contained in:
Codex Agent
2026-03-08 05:14:22 +00:00
parent cf74f06de0
commit 328b0ece6a
3 changed files with 76 additions and 1 deletions

View File

@@ -31,6 +31,62 @@ CATEGORY_KEYS = [
# Set form of CATEGORY_KEYS for O(1) membership checks during validation.
CATEGORY_SET = set(CATEGORY_KEYS)
# Lightweight label normalization to catch duplicate/identity conversions
# that differ only by abbreviations (e.g., "cm" vs "centimeters").
TOKEN_MAP = {
    'cm': 'centimeter',
    'centimeter': 'centimeter',
    'centimetre': 'centimeter',
    'centimetres': 'centimeter',
    'centimeters': 'centimeter',
    'mm': 'millimeter',
    'millimeter': 'millimeter',
    'millimeters': 'millimeter',
    'millimetre': 'millimeter',
    'millimetres': 'millimeter',
    'm': 'meter',
    'meter': 'meter',
    'meters': 'meter',
    'metre': 'meter',
    'metres': 'meter',
    'km': 'kilometer',
    'kilometer': 'kilometer',
    'kilometers': 'kilometer',
    'kilometre': 'kilometer',
    'kilometres': 'kilometer',
    'in': 'inch',
    'inch': 'inch',
    'inches': 'inch',
    'ft': 'foot',
    'foot': 'foot',
    'feet': 'foot',
}
def normalize_label(label: str) -> str:
    """Canonicalize a unit label for duplicate detection.

    - Lowercase
    - Replace '/' with ' per ' to align fraction style with text style
    - Strip punctuation into tokens
    - Collapse common abbreviations/plurals via TOKEN_MAP and simple singularization

    Returns the canonicalized tokens joined by single spaces.
    """
    cleaned = label.lower().replace('/', ' per ')
    tokens = re.split(r'[^a-z0-9]+', cleaned)
    normalized_tokens = []
    for tok in tokens:
        if not tok:
            continue
        # Exact map hits (including irregular plurals like 'inches' -> 'inch',
        # 'feet' -> 'foot') must take priority. Previously the trailing-'s'
        # strip ran first, turning 'inches' into 'inche', which missed the map
        # and broke duplicate detection against 'inch'.
        if tok in TOKEN_MAP:
            normalized_tokens.append(TOKEN_MAP[tok])
            continue
        base = tok
        # Drop a trailing 's' for simple plurals, but avoid short abbreviations like 'cms'
        if base.endswith('s') and len(base) > 3:
            base = base[:-1]
        normalized_tokens.append(TOKEN_MAP.get(base, base))
    return ' '.join(normalized_tokens)
def load_external_descriptions():
    """Return a mapping of externally sourced descriptions.

    Stub for future enrichment sources; nothing is wired up yet, so the
    result is always an empty mapping.
    """
    return dict()
@@ -92,6 +148,7 @@ def process():
# Accumulator for generated TypeScript entries, plus de-dup trackers:
# seen_slugs guards against repeated slugs, seen_norm_pairs against
# converters whose normalized (from, to) unit pair was already emitted.
calculators_ts_entries = []
seen_slugs = set()
seen_norm_pairs = set()
for raw_name, slug, category_raw, factor_raw in active_rows:
# Skip the spreadsheet header row and any row missing a slug.
if raw_name == 'Calculator Name' or not slug:
continue
@@ -106,7 +163,21 @@ def process():
# Default unit labels when no custom labels are provided.
in1, in2 = "From", "To"
custom_labels = None
# Canonical forms allow comparing labels that differ only by
# spelling/abbreviation (e.g. "cm" vs "Centimeters").
norm_in1 = normalize_label(in1)
norm_in2 = normalize_label(in2)
# Skip identity conversions that only differ by spelling/abbreviation
if norm_in1 == norm_in2:
print(f"Skipping identity converter {slug}: {in1} -> {in2}")
continue
# NOTE: the pair key is ordered, so (a, b) and (b, a) are treated as
# distinct converters here.
pair_key = (norm_in1, norm_in2)
if pair_key in seen_norm_pairs:
print(f"Skipping duplicate converter {slug}: {in1} -> {in2}")
continue
seen_norm_pairs.add(pair_key)
# Categories are required; normalize_category returning a falsy value
# aborts processing for this row.
category = normalize_category(category_raw)
if not category:
raise ValueError(f'Category required for {display_name}')