# HowDoYouConvert/migrate.py

import json
import re
from pathlib import Path

# Filesystem locations, all resolved relative to the directory holding this script.
BASE_DIR = Path(__file__).resolve().parents[0]
# Markdown file containing the calculator table read by parse_calculators_list().
CALCLIST = BASE_DIR.joinpath('calculators_list.md')
# TypeScript module that process() writes.
OUTPUT_FILE = BASE_DIR.joinpath('hdyc-svelte/src/lib/data/calculators.ts')

# Category keys a calculator row may resolve to (checked in process()).
CATEGORY_KEYS = [
    'length', 'weight', 'temperature', 'volume', 'fluids',
    'area', 'speed', 'pressure', 'energy', 'magnetism',
    'power', 'data', 'time', 'angle', 'number-systems',
    'radiation', 'electrical', 'force', 'light', 'other',
]

# Set view of the same keys for constant-time membership tests.
CATEGORY_SET = {*CATEGORY_KEYS}

# Lightweight label normalization to catch duplicate/identity conversions
# that differ only by abbreviations (e.g., "cm" vs "centimeters").
# Keys are lowercase tokens; values are the canonical singular unit name.
TOKEN_MAP = {
    'cm': 'centimeter',
    'centimeter': 'centimeter',
    'centimetre': 'centimeter',
    'centimetres': 'centimeter',
    'centimeters': 'centimeter',
    'mm': 'millimeter',
    'millimeter': 'millimeter',
    'millimeters': 'millimeter',
    'millimetre': 'millimeter',
    'millimetres': 'millimeter',
    'm': 'meter',
    'meter': 'meter',
    'meters': 'meter',
    'metre': 'meter',
    'metres': 'meter',
    'km': 'kilometer',
    'kilometer': 'kilometer',
    'kilometers': 'kilometer',
    'kilometre': 'kilometer',
    'kilometres': 'kilometer',
    'in': 'inch',
    'inch': 'inch',
    'inches': 'inch',
    'ft': 'foot',
    'foot': 'foot',
    'feet': 'foot',
}


def normalize_label(label: str) -> str:
    """Canonicalize a unit label for duplicate detection.

    - Lowercase
    - Replace '/' with ' per ' to align fraction style with text style
    - Strip punctuation into tokens
    - Collapse common abbreviations/plurals via TOKEN_MAP and simple singularization

    Args:
        label: Human-readable unit label, e.g. ``"Inches"`` or ``"m/s"``.

    Returns:
        The space-joined canonical tokens. If the label contains no ASCII
        letters or digits at all (e.g. a lone unit symbol), the lowercased,
        stripped label is returned instead of an empty string so that two
        different symbol-only labels never compare equal.
    """
    cleaned = label.lower().replace('/', ' per ')

    normalized_tokens = []
    for tok in re.split(r'[^a-z0-9]+', cleaned):
        if not tok:
            continue
        # Try the token verbatim first: irregular plurals such as "inches"
        # are listed explicitly and would be missed after a naive "s" strip
        # ("inches" -> "inche").
        if tok in TOKEN_MAP:
            normalized_tokens.append(TOKEN_MAP[tok])
            continue
        # Drop a trailing 's' for simple plurals, but avoid short abbreviations like 'cms'
        base = tok[:-1] if tok.endswith('s') and len(tok) > 3 else tok
        normalized_tokens.append(TOKEN_MAP.get(base, base))

    return ' '.join(normalized_tokens) or cleaned.strip()

def load_external_descriptions():
    """Return extra descriptions for calculators.

    No enrichment source is wired in yet, so the result is always an empty
    dict; process() looks descriptions up in it by slug.
    """
    descriptions = dict()
    return descriptions

def parse_calculators_list():
    """Read the calculator table from CALCLIST.

    The table starts at the line containing ``| Calculator Name`` and ends at
    the next ``## `` heading. Columns are located by header text, so their
    order in the markdown does not matter.

    Returns:
        A list of ``(name, slug, category, factor_raw)`` tuples of stripped
        cell strings, one per data row. Rows are skipped when a required
        column is missing from the header or the row has too few cells.
    """
    required = ('Calculator Name', 'Slug', 'Category', 'Conversion Factor')
    active_calcs = []
    in_table = False
    indices = None  # column positions of `required`, resolved once per header row

    # The table contains non-ASCII punctuation (dashes, arrows); read it as
    # UTF-8 explicitly instead of relying on the platform default encoding.
    with open(CALCLIST, 'r', encoding='utf-8') as f:
        for line in f:
            if in_table and line.startswith('## '):
                break
            if '| Calculator Name' in line:
                in_table = True
                headers = [p.strip() for p in line.strip().strip('|').split('|')]
                header_map = {header: idx for idx, header in enumerate(headers)}
                if all(key in header_map for key in required):
                    indices = tuple(header_map[key] for key in required)
                else:
                    indices = None
                continue
            if not in_table or not line.startswith('|'):
                continue

            parts = [p.strip() for p in line.strip().strip('|').split('|')]
            # Skip the markdown delimiter row in any of its spellings
            # (":---", "---", ":--:", ...), not just the "| :---" prefix.
            if all(cell and '-' in cell and set(cell) <= {':', '-'} for cell in parts):
                continue
            if indices is None or len(parts) <= max(indices):
                continue
            active_calcs.append(tuple(parts[i] for i in indices))

    return active_calcs

def split_name_and_teaser(name):
    """Split ``"Title - teaser"`` at the first whitespace-delimited dash.

    Hyphen, en dash and em dash are all accepted as the separator, but only
    when surrounded by whitespace, so hyphenated words stay intact.

    Returns:
        ``(title, teaser)``, both stripped; the teaser is ``''`` when the
        name contains no separator.
    """
    dash = re.search(r'\s[–—-]\s', name)
    if dash is None:
        return name.strip(), ''
    title = name[:dash.start()]
    blurb = name[dash.end():]
    return title.strip(), blurb.strip()


def split_conversion_name(name):
    """Split an ``"A to B"`` calculator name into its two unit labels.

    The word "to" is matched case-insensitively and must have whitespace on
    both sides; only the first occurrence splits the name.

    Returns:
        ``(source, target)`` with both parts stripped, or ``None`` when the
        name contains no such separator.
    """
    joiner = re.search(r'\s+to\s+', name, flags=re.IGNORECASE)
    if joiner is None:
        return None
    source = name[:joiner.start()].strip()
    target = name[joiner.end():].strip()
    return source, target

def normalize_category(raw: str) -> str:
    """Turn a free-form category cell into a category key.

    The text is stripped and lowercased, spaces become hyphens, and every
    character other than ``a-z``, ``0-9`` and ``-`` is dropped.
    """
    allowed = 'abcdefghijklmnopqrstuvwxyz0123456789-'
    hyphenated = raw.strip().lower().replace(' ', '-')
    return ''.join(ch for ch in hyphenated if ch in allowed)

# Size in bits of every plain data unit that process() re-derives factors for.
# Decimal prefixes are powers of 10 bytes, binary prefixes powers of 2 bytes,
# and a bit is 1/8 byte. Keeping integers makes every factor an exact ratio.
_DATA_UNIT_BITS = {'bit': 1}
_DATA_UNIT_BITS.update(
    (unit, 8 * 10 ** exponent)
    for unit, exponent in (
        ('byte', 0), ('kilobyte', 3), ('megabyte', 6), ('gigabyte', 9),
        ('terabyte', 12), ('petabyte', 15), ('exabyte', 18),
    )
)
_DATA_UNIT_BITS.update(
    (unit, 8 * 2 ** exponent)
    for unit, exponent in (
        ('kibibyte', 10), ('mebibyte', 20), ('gibibyte', 30),
        ('tebibyte', 40), ('pebibyte', 50), ('exbibyte', 60),
    )
)

# Static TypeScript emitted before the generated calculator entries.
_TS_HEADER = """// THIS FILE IS AUTO-GENERATED BY migrate.py
export type CalcType = 'standard' | 'inverse' | '3col' | '3col-mul' | 'base' | 'text-bin' | 'bin-text' | 'dms-dd' | 'dd-dms' | 'dec-frac' | 'db-int' | 'db-spl' | 'db-v' | 'db-w' | 'awg' | 'brinell-rockwell' | 'ev-lux' | 'aov' | 'swg' | 'rockwell-vickers' | 'sus-cst' | 'molarity';

export interface CalculatorDef {
  slug: string;
  name: string;
  category: string;
  type: CalcType;
  hidden?: boolean;
  factor?: number;
  offset?: number;
  fromBase?: number;
  toBase?: number;
  labels: { in1: string; in2: string; in3?: string };
  descriptionHTML?: string;
  teaser?: string;
}

export const categories: Record<string, { label: string; icon: string }> = {
  length:          { label: 'Length / Distance',   icon: '📏' },
  weight:          { label: 'Weight / Mass',       icon: '⚖️' },
  temperature:     { label: 'Temperature',         icon: '🌡️' },
  volume:          { label: 'Volume',              icon: '🧪' },
  fluids:          { label: 'Fluids',              icon: '💧' },
  area:            { label: 'Area',                icon: '📐' },
  speed:           { label: 'Speed / Velocity',    icon: '💨' },
  pressure:        { label: 'Pressure',            icon: '🔽' },
  energy:          { label: 'Energy',              icon: '⚡' },
  magnetism:       { label: 'Magnetism',           icon: '🧲' },
  power:           { label: 'Power',               icon: '🔌' },
  data:            { label: 'Data Storage',        icon: '💾' },
  time:            { label: 'Time',                icon: '⏱️' },
  angle:           { label: 'Angle',               icon: '📐' },
  'number-systems':{ label: 'Number Systems',      icon: '🔢' },
  radiation:       { label: 'Radiation',           icon: '☢️' },
  electrical:      { label: 'Electrical',          icon: '🔋' },
  force:           { label: 'Force / Torque',      icon: '💪' },
  light:           { label: 'Light',               icon: '💡' },
  other:           { label: 'Other',               icon: '🔄' },
};

export const calculators: CalculatorDef[] = [
"""

# Static TypeScript emitted after the generated calculator entries.
_TS_FOOTER = """
];

const slugIndex = new Map(calculators.map(c => [c.slug, c]));

export function getCalculatorBySlug(slug: string): CalculatorDef | undefined {
  return slugIndex.get(slug);
}

export function getCalculatorsByCategory(category: string): CalculatorDef[] {
  return calculators.filter(c => c.category === category && !c.hidden);
}

export function getCategoriesWithCounts(): { key: string; label: string; icon: string; count: number }[] {
  return Object.entries(categories).map(([key, meta]) => ({
    key,
    ...meta,
    count: calculators.filter(c => c.category === key && !c.hidden).length,
  }));
}

export function searchCalculators(query: string): CalculatorDef[] {
  const q = query.toLowerCase();
  return calculators.filter(c =>
    (c.name.toLowerCase().includes(q) ||
    c.slug.includes(q) ||
    c.labels.in1.toLowerCase().includes(q) ||
    c.labels.in2.toLowerCase().includes(q)) && !c.hidden
  );
}
"""


def _fraction_to_decimal(text: str) -> str:
    """Turn ``"a/b"`` into its decimal string; return other text unchanged.

    Malformed fractions are returned as-is so the later float() conversion
    rejects them and the factor is simply omitted.
    """
    if '/' not in text:
        return text
    numerator, denominator = text.split('/')[:2]
    try:
        return str(float(numerator) / float(denominator))
    except (ValueError, ZeroDivisionError):
        return text


def _classify(slug: str, factor_raw: str):
    """Derive ``(c_type, factor_val, offset_val, custom_labels)`` for one row.

    Slug-specific calculator families are checked first, then markers in the
    "Conversion Factor" cell; anything left is a 'standard' linear conversion
    whose factor/offset are returned as strings ("1" / "0" when absent).
    """
    c_type = 'standard'
    factor_val = "1"
    offset_val = "0"
    custom_labels = None

    # Special-case calculator families that require custom math beyond simple factors.
    if slug == 'molarity-to-grams-per-liter':
        c_type = 'molarity'
        custom_labels = {'in1': 'Molarity (mol/L)', 'in2': 'Grams per liter', 'in3': 'Molar mass (g/mol)'}
    elif slug == 'rockwell-c-to-vickers':
        c_type = 'rockwell-vickers'
    elif 'ev-to-lux' in slug or 'lux-to-ev' in slug:
        c_type = 'ev-lux'
    elif 'focal-length-to-angle-of-view' in slug:
        c_type = 'aov'
    elif 'awg' in slug:
        c_type = 'awg'
    elif 'swg-to' in slug or '-to-swg' in slug:
        c_type = 'swg'
    elif slug in ('brinell-to-rockwell-c', 'rockwell-c-to-brinell'):
        c_type = 'brinell-rockwell'
    elif slug == 'saybolt-universal-seconds-to-centistokes':
        c_type = 'sus-cst'
    elif '1/x' in factor_raw:
        c_type = 'inverse'
    elif 'Multi-Variable' in factor_raw:
        # Based on hdyc-calculators.js, amps-to-watts is 3col-mul, watts-to-amps is 3col
        c_type = '3col-mul' if slug in ('amps-to-watts', 'lux-to-lumens') else '3col'
    elif 'Logarithmic' in factor_raw or 'Exponential' in factor_raw:
        if 'db-int' in slug or 'intensity' in slug:
            c_type = 'db-int'
        elif 'spl' in slug or 'sound' in slug:
            c_type = 'db-spl'
        elif 'volts' in slug:
            c_type = 'db-v'
        else:
            c_type = 'db-w'
    elif 'Base 60' in factor_raw:
        if slug == 'degrees-minutes-and-seconds-to-decimal-degrees':
            c_type = 'dms-dd'
        else:
            c_type = 'dd-dms'
    elif 'GCD' in factor_raw or 'string split' in factor_raw or 'fraction' in slug:
        c_type = 'dec-frac'
    elif 'N/A' in factor_raw or 'Text' in factor_raw:
        if 'ascii' in slug:
            c_type = 'text-bin' if slug.startswith('ascii') else 'bin-text'
        elif any(word in slug for word in ('binary', 'hex', 'decimal', 'octal')):
            c_type = 'base'
    elif ('10_to_2' in factor_raw or '16_to_2' in factor_raw
          or '10_to_16' in factor_raw or 'base' in factor_raw.lower()):
        c_type = 'base'
    elif 'Linear Offset' in factor_raw:
        # "Linear Offset (1.8x + 32)"
        with_offset = re.search(r'Linear Offset \(([-\d\./]+)x\s*([+-]\s*[\d\.]+)\)', factor_raw)
        if with_offset:
            factor_val = _fraction_to_decimal(with_offset.group(1))
            offset_val = with_offset.group(2).replace(' ', '')
        else:
            # "Linear Offset (5/9x)": the slope may be a fraction here too.
            slope_only = re.search(r'Linear Offset \(([-\d\./]+)x\)', factor_raw)
            if slope_only:
                factor_val = _fraction_to_decimal(slope_only.group(1))
    else:
        try:
            # Keep the cell verbatim when it is a plain number.
            float(factor_raw)
            factor_val = factor_raw
        except ValueError:
            pass  # descriptive, non-numeric factor: leave the default of "1"

    return c_type, factor_val, offset_val, custom_labels


def _base_fields(slug: str, factor_raw: str) -> dict:
    """Work out ``fromBase`` / ``toBase`` for a 'base' calculator.

    A radix word at the start of the slug names the source base, anywhere
    else the target base. ``base-N-to-base-M`` slugs and a "base N to M" hint
    in the factor cell only fill in values that are still missing.
    """
    fields = {}
    for word, radix in (('binary', 2), ('hex', 16), ('octal', 8), ('decimal', 10)):
        if word in slug:
            fields['fromBase' if slug.startswith(word) else 'toBase'] = radix

    if 'base-' in slug:
        parts = slug.split('-')
        if len(parts) >= 5 and parts[0] == 'base' and parts[2] == 'to' and parts[3] == 'base':
            try:
                fields.setdefault('fromBase', int(parts[1]))
                fields.setdefault('toBase', int(parts[4]))
            except ValueError:
                pass

    if 'base' in factor_raw.lower():
        match = re.search(r'base\s*(\d+)\s*(?:→|to)\s*(?:base\s*)?(\d+)', factor_raw, re.IGNORECASE)
        if match:
            fields.setdefault('fromBase', int(match.group(1)))
            fields.setdefault('toBase', int(match.group(2)))
    return fields


def _data_factor(in1: str, in2: str):
    """Return the exact in1 -> in2 factor for plain data units.

    Labels are lowercased and a trailing 's' is dropped before lookup in
    _DATA_UNIT_BITS. Returns None when either label is not a known unit.
    """
    sizes = []
    for label in (in1, in2):
        key = label.lower()
        if key.endswith('s'):
            key = key[:-1]
        sizes.append(_DATA_UNIT_BITS.get(key))
    if None in sizes:
        return None
    return sizes[0] / sizes[1]


def _mark_reverse_pairs(entries: list) -> None:
    """Set ``hidden`` on one calculator of each "A to B" / "B to A" pair.

    The reverse slug is guessed from the display name. An entry with a factor
    is hidden when that factor is below 1; an entry without a factor is hidden
    when its slug sorts after the reverse slug. 'data' entries are left alone.
    """
    # NOTE(review): the two rules are applied per entry, so a pair where one
    # side has factor < 1 and the other has no factor but the later slug can
    # end up with both sides hidden - confirm whether that can occur in the data.
    existing_slugs = {e['slug'] for e in entries}
    for e in entries:
        if e.get('category') == 'data':
            continue
        parsed = split_conversion_name(e['name'])
        if not parsed:
            continue
        rev_slug = f"{parsed[1]} to {parsed[0]}".lower().replace(' ', '-')
        if rev_slug not in existing_slugs or e['slug'] == rev_slug:
            continue
        if 'factor' in e:
            if e['factor'] < 1.0:
                e['hidden'] = True
        elif e['slug'] > rev_slug:
            # no factor to compare: just hide the alphabetically later one
            e['hidden'] = True


def process():
    """Generate OUTPUT_FILE (calculators.ts) from the markdown calculator table.

    For every table row this derives the display name, teaser, labels,
    calculator type and numeric parameters, skipping identity conversions,
    duplicate unit pairs and repeated slugs. Reverse pairs are then marked
    hidden and everything is written out as a TypeScript module.

    Raises:
        ValueError: if a row has an empty category or one that is not in
            CATEGORY_SET.
    """
    external_descriptions = load_external_descriptions()
    active_rows = parse_calculators_list()

    calculators_ts_entries = []

    seen_slugs = set()
    seen_norm_pairs = set()
    for raw_name, slug, category_raw, factor_raw in active_rows:
        if raw_name == 'Calculator Name' or not slug:
            continue

        display_name, teaser = split_name_and_teaser(raw_name)

        parsed = split_conversion_name(display_name)
        if parsed:
            in1, in2 = parsed
            norm_in1 = normalize_label(in1)
            norm_in2 = normalize_label(in2)

            # Skip identity conversions that only differ by spelling/abbreviation
            if norm_in1 == norm_in2:
                print(f"Skipping identity converter {slug}: {in1} -> {in2}")
                continue

            pair_key = (norm_in1, norm_in2)
            if pair_key in seen_norm_pairs:
                print(f"Skipping duplicate converter {slug}: {in1} -> {in2}")
                continue
            seen_norm_pairs.add(pair_key)
        else:
            # Placeholder labels carry no unit information, so they must not
            # take part in duplicate detection: otherwise every calculator
            # after the first one without an "A to B" name would be dropped.
            in1, in2 = "From", "To"

        category = normalize_category(category_raw)
        if not category:
            raise ValueError(f'Category required for {display_name}')
        if category not in CATEGORY_SET:
            raise ValueError(f'Unknown category "{category_raw}" resolved to "{category}" for {slug}')
        if slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        desc_html = external_descriptions.get(slug, "")

        c_type, factor_val, offset_val, custom_labels = _classify(slug, factor_raw)
        is_three_col = c_type in ('3col', '3col-mul')
        op = '*' if c_type == '3col-mul' else '/'

        # Give 3-col calculators honest display names instead of "A to B"
        if is_three_col and parsed:
            display_name = f"{in1} {op} {in2}"

        entry = {
            'slug': slug,
            'name': display_name,
            'category': category,
            'type': c_type
        }
        if teaser:
            entry['teaser'] = teaser

        # Determine labels
        labels = {'in1': in1, 'in2': in2}
        if is_three_col:
            # generic 3rd label; make it descriptive instead of the vague "Result"
            if 'watts' in slug and 'amps' in slug:
                labels['in3'] = 'Volts'
            elif 'lumens' in slug:
                labels['in3'] = 'Area (sq m)'
            elif 'moles' in slug:
                labels['in3'] = 'Molar Mass'
            else:
                labels['in3'] = f"{in1} {op} {in2}"
        if custom_labels:
            labels = custom_labels
        entry['labels'] = labels

        if c_type == 'standard':
            # Defaults are omitted from the output; unparsable values are dropped.
            if factor_val != "1":
                try:
                    entry['factor'] = float(factor_val)
                except ValueError:
                    pass
            if offset_val != "0":
                try:
                    entry['offset'] = float(offset_val)
                except ValueError:
                    pass

        if c_type == 'base':
            entry.update(_base_fields(slug, factor_raw))

        if category == 'data' and c_type == 'standard':
            # Megabytes, Gigabytes, Terabytes etc use decimal base-10 sizes (1000),
            # Mebibytes, Gibibytes, Tebibytes use binary base-2 sizes (1024).
            # When both units are known, the exact ratio replaces the table's factor.
            exact = _data_factor(in1, in2)
            if exact is not None:
                entry['factor'] = exact

        # Remove empty descriptions
        if desc_html:
            # Stored as a normal field so json.dumps escapes it safely.
            entry['descriptionHTML'] = desc_html.replace('\n', '')

        calculators_ts_entries.append(entry)

    _mark_reverse_pairs(calculators_ts_entries)

    rows = ''.join(f"  {json.dumps(e)},\n" for e in calculators_ts_entries)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(_TS_HEADER + rows + _TS_FOOTER)

    print(f"Generated {len(calculators_ts_entries)} calculators into calculators.ts")

# Run the migration only when this file is executed as a script, not on import.
if __name__ == '__main__':
    process()