# HowDoYouConvert/migrate.py

import json
import re
from pathlib import Path

# All paths are resolved relative to the directory that contains this script.
BASE_DIR = Path(__file__).resolve().parent
# Markdown table of calculators that the migration reads.
CALCLIST = BASE_DIR.joinpath('calculators_list.md')
# TypeScript module that the migration (re)generates.
OUTPUT_FILE = BASE_DIR.joinpath('hdyc-svelte/src/lib/data/calculators.ts')

# Lower-case substrings; a calculator whose name contains any of them is
# categorised as 'fluids' by guess_category().
FLUID_KEYWORDS = [
    # Flow / transport properties
    'flow', 'mass flux', 'volumetric', 'permeability', 'viscosity',
    'kinematic', 'surface tension', 'molar', 'concentration',
    'flux density', 'flow rate',
    # Volume-per-something rates
    'gallon per', 'gallons per', 'liter per', 'liters per', 'cubic per',
    'cubic meter per', 'cubic meters per', 'cubic foot per',
    'cubic feet per', 'cubic inch per', 'cubic inches per',
    # Mass-per-something rates
    'kg per', 'kilogram per', 'kilograms per', 'gram per', 'grams per',
    'g per', 'lb per', 'lbs per', 'pound per', 'pounds per',
    # Amount-of-substance rates and concentrations
    'mole per', 'moles per', 'mol per', 'mmol per', 'percent by mass',
    'ppm',
    # Heat transfer and "per area" / "per volume" quantities
    'heat transfer coefficient',
    'per square meter', 'per square metre', 'per square foot',
    'per square inch', 'per square centimeter', 'per square centimetre',
    'per cubic meter', 'per cubic metre', 'per cubic foot',
    'per cubic inch',
]

# Lower-case substrings that mark a calculator as 'currency'.
CURRENCY_KEYWORDS = ['currency', 'exchange rate', 'forex']

def load_external_descriptions():
    """Return extra calculator descriptions, looked up by slug in process().

    No enrichment source is wired in yet, so the mapping is always empty.
    """
    descriptions = {}
    return descriptions

def parse_calculators_list():
    active_calcs = []
    with open(CALCLIST, 'r') as f:
        lines = f.readlines()

    in_table = False
    header_map = {}
    for line in lines:
        if in_table and line.startswith('## '):
            break
        if '| Calculator Name' in line:
            in_table = True
            headers = [p.strip() for p in line.strip().strip('|').split('|')]
            header_map = {header: idx for idx, header in enumerate(headers)}
            continue
        if in_table and line.startswith('| :---'):
            continue
        if in_table and line.startswith('|'):
            parts = [p.strip() for p in line.strip().strip('|').split('|')]
            name_idx = header_map.get('Calculator Name')
            slug_idx = header_map.get('Slug')
            factor_idx = header_map.get('Conversion Factor')
            if None not in (name_idx, slug_idx, factor_idx) and len(parts) > max(name_idx, slug_idx, factor_idx):
                name = parts[name_idx]
                slug = parts[slug_idx]
                factor_raw = parts[factor_idx]
                active_calcs.append((name, slug, factor_raw))

    return active_calcs

def split_name_and_teaser(name):
    """Split ``"Title – teaser"`` into ``(title, teaser)``.

    The separator is the first en dash, em dash or hyphen that has a
    whitespace character on both sides. When there is none, the whole
    (stripped) name is the title and the teaser is ``''``.
    """
    separator = re.search(r'\s[–—-]\s', name)
    if separator is None:
        return name.strip(), ''
    title = name[:separator.start()]
    teaser = name[separator.end():]
    return title.strip(), teaser.strip()


def split_conversion_name(name):
    """Split ``"X to Y"`` into ``(X, Y)`` at the first whitespace-delimited "to".

    The match is case-insensitive. Returns ``None`` when the name contains no
    such separator.
    """
    joiner = re.search(r'\s+to\s+', name, flags=re.IGNORECASE)
    if joiner is None:
        return None
    source_unit = name[:joiner.start()].strip()
    target_unit = name[joiner.end():].strip()
    return source_unit, target_unit

def guess_category(name):
    """Guess the category key for a calculator from keywords in its name.

    The name is lower-cased and tested against an ordered rule table; the
    first rule with a matching keyword wins, and ``'other'`` is returned when
    nothing matches. Keywords are plain substrings, so rule order is
    significant (e.g. the force rule with 'newton' must precede the weight
    rule with 'ton').

    Two ordering/matching problems of the plain substring scan are handled:

    * Magnetism is tested before length, otherwise 'ampere per meter' can
      never match because 'meter' already selects 'length'.
    * 'mil' / 'mils' (angular mils) only match as whole words, so names with
      a "milli..." prefix (milliseconds, millivolts, ...) are no longer
      classified as angles.

    NOTE(review): other keywords are still substrings and can over-match
    (e.g. 'bar' in "barrel"); verify categories against the real list.

    Args:
        name: Calculator display name, e.g. ``"Meters to Feet"``.

    Returns:
        A category key string such as ``'length'`` or ``'other'``.
    """
    name_l = name.lower()

    # Keywords that must match as a whole word rather than as a substring.
    whole_word_only = {'mil', 'mils'}

    def mentions(keywords):
        for keyword in keywords:
            if keyword in whole_word_only:
                if re.search(r'\b' + re.escape(keyword) + r'\b', name_l):
                    return True
            elif keyword in name_l:
                return True
        return False

    # (category, keywords) pairs, tried top to bottom.
    ordered_rules = (
        ('currency', CURRENCY_KEYWORDS),
        ('fluids', FLUID_KEYWORDS),
        # Acre-feet are a volume even though 'acre' alone means area.
        ('volume', ('acre-foot', 'acre-feet', 'acrefoot', 'acre feet')),
        # The Newton temperature scale must not be mistaken for the force unit.
        ('temperature', ('temp scale', 'newton (temp')),
        ('magnetism', ('magnet', 'magnetic', 'tesla', 'gauss', 'oersted', 'weber', 'maxwell',
                       'gilbert', 'ampere-turn', 'ampere turns', 'ampere per meter',
                       'magnetomotive')),
        ('force', ('force', 'torque', 'newton', 'dyne', 'foot-pound')),
        ('area', ('acre', 'hectare', 'square')),
        ('length', ('meter', 'inch', 'feet', 'yard', 'mile', 'cable', 'fathom', 'rod', 'chain',
                    'nautical', 'league')),
        ('weight', ('gram', 'pound', 'ounce', 'carat', 'stone', 'slug', 'ton', 'pennyweight',
                    'grain', 'momme')),
        ('temperature', ('celsius', 'fahrenheit', 'kelvin', 'rankine', 'delisle', 'reaumur',
                         'réaumur', 'romer', 'rømer')),
        ('volume', ('liter', 'gallon', 'cup', 'pint', 'quart', 'fluid', 'milliliter', 'spoon',
                    'drop')),
        ('pressure', ('pascal', 'bar', 'psi', 'atmosphere', 'mmhg', 'torr', 'water', 'mercury')),
        ('energy', ('joule', 'calorie', 'btu', 'erg', 'therm', 'electron-volt')),
        ('temperature', ('thermal', 'heat', 'conductivity', 'resistance', 'capacity',
                         'expansion', 'transfer')),
        ('power', ('watt', 'horsepower')),
        ('data', ('byte', 'bit', 'nibble', 'baud')),
        ('light', ('light', 'lumen', 'lux', 'candela')),
        ('angle', ('degree', 'degrees', 'radian', 'radians', 'arcminute', 'arcminutes',
                   'arcsecond', 'arcseconds', 'gradian', 'gradians', 'mil', 'mils', 'quadrant',
                   'quadrants', 'sextant', 'sextants', 'turn', 'turns', 'points (compass',
                   'points-compass')),
        ('time', ('second', 'minute', 'hour', 'day', 'week', 'month', 'year')),
        ('number-systems', ('binary', 'hex', 'octal', 'decimal', 'ascii', 'fraction')),
        ('radiation', ('becquerel', 'curie', 'gray', 'rad', 'sievert', 'rem', 'roentgen',
                       'rutherford')),
        ('electrical', ('volt', 'amp', 'ohm', 'siemens', 'farad', 'henry', 'coulomb')),
        # RPM might be frequency, close enough.
        ('speed', (' per ', 'knot', 'mach', 'rpm')),
        # Generic "Base N" names; the named bases were already handled above.
        ('number-systems', ('base',)),
    )
    for category, keywords in ordered_rules:
        if mentions(keywords):
            return category
    return 'other'

# Head of the generated TypeScript module: type declarations, the category
# table, and the opening of the ``calculators`` array literal.
_TS_HEADER = """// THIS FILE IS AUTO-GENERATED BY migrate.py
export type CalcType = 'standard' | 'inverse' | '3col' | '3col-mul' | 'base' | 'text-bin' | 'bin-text' | 'dms-dd' | 'dd-dms' | 'dec-frac' | 'db-int' | 'db-spl' | 'db-v' | 'db-w';

export interface CalculatorDef {
  slug: string;
  name: string;
  category: string;
  type: CalcType;
  hidden?: boolean;
  factor?: number;
  offset?: number;
  fromBase?: number;
  toBase?: number;
  labels: { in1: string; in2: string; in3?: string };
  descriptionHTML?: string;
  teaser?: string;
}

export const categories: Record<string, { label: string; icon: string }> = {
  length:          { label: 'Length / Distance',   icon: '📏' },
  weight:          { label: 'Weight / Mass',       icon: '⚖️' },
  temperature:     { label: 'Temperature',         icon: '🌡️' },
  volume:          { label: 'Volume',              icon: '🧪' },
  fluids:          { label: 'Fluids',              icon: '💧' },
  area:            { label: 'Area',                icon: '📐' },
  speed:           { label: 'Speed / Velocity',    icon: '💨' },
  pressure:        { label: 'Pressure',            icon: '🔽' },
  energy:          { label: 'Energy',              icon: '⚡' },
  currency:        { label: 'Currency',            icon: '💱' },
  magnetism:       { label: 'Magnetism',           icon: '🧲' },
  power:           { label: 'Power',               icon: '🔌' },
  data:            { label: 'Data Storage',        icon: '💾' },
  time:            { label: 'Time',                icon: '⏱️' },
  angle:           { label: 'Angle',               icon: '📐' },
  'number-systems':{ label: 'Number Systems',      icon: '🔢' },
  radiation:       { label: 'Radiation',           icon: '☢️' },
  electrical:      { label: 'Electrical',          icon: '🔋' },
  force:           { label: 'Force / Torque',      icon: '💪' },
  light:           { label: 'Light',               icon: '💡' },
  other:           { label: 'Other',               icon: '🔄' },
};

export const calculators: CalculatorDef[] = [
"""

# Tail of the generated module: closes the array and adds lookup helpers.
_TS_FOOTER = """
];

const slugIndex = new Map(calculators.map(c => [c.slug, c]));

export function getCalculatorBySlug(slug: string): CalculatorDef | undefined {
  return slugIndex.get(slug);
}

export function getCalculatorsByCategory(category: string): CalculatorDef[] {
  return calculators.filter(c => c.category === category && !c.hidden);
}

export function getCategoriesWithCounts(): { key: string; label: string; icon: string; count: number }[] {
  return Object.entries(categories).map(([key, meta]) => ({
    key,
    ...meta,
    count: calculators.filter(c => c.category === key && !c.hidden).length,
  }));
}

export function searchCalculators(query: string): CalculatorDef[] {
  const q = query.toLowerCase();
  return calculators.filter(c =>
    (c.name.toLowerCase().includes(q) ||
    c.slug.includes(q) ||
    c.labels.in1.toLowerCase().includes(q) ||
    c.labels.in2.toLowerCase().includes(q)) && !c.hidden
  );
}
"""

# Size of each recognised data unit in bytes. Decimal prefixes use powers of
# 1000, binary (IEC) prefixes use powers of 1024, and a bit is 1/8 byte.
_DATA_UNIT_BYTES = {
    'bit': 0.125,
    'byte': 1,
    'kilobyte': 10 ** 3,
    'megabyte': 10 ** 6,
    'gigabyte': 10 ** 9,
    'terabyte': 10 ** 12,
    'petabyte': 10 ** 15,
    'exabyte': 10 ** 18,
    'kibibyte': 2 ** 10,
    'mebibyte': 2 ** 20,
    'gibibyte': 2 ** 30,
    'tebibyte': 2 ** 40,
    'pebibyte': 2 ** 50,
    'exbibyte': 2 ** 60,
}

# Slug-token prefixes that name a number base. 'hex' is listed so that both
# 'hex' and 'hexadecimal' resolve to 16 (a token is matched by prefix, so
# 'hexadecimal' is never mistaken for 'decimal').
_BASE_NAME_PREFIXES = (('binary', 2), ('hex', 16), ('octal', 8), ('decimal', 10))


def _parse_float(text):
    """Return ``float(text)``, or None when *text* is not a valid number."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def _ratio_text(text):
    """Return *text* with an ``a/b`` fraction evaluated; None if it is malformed."""
    if '/' not in text:
        return text
    numerator, _, denominator = text.partition('/')
    top, bottom = _parse_float(numerator), _parse_float(denominator)
    if top is None or bottom is None or bottom == 0:
        return None
    return str(top / bottom)


def _classify_conversion(slug, factor_raw):
    """Map a table row to ``(c_type, factor_val, offset_val)``.

    ``factor_val`` and ``offset_val`` are strings; the defaults ``"1"`` and
    ``"0"`` mean "emit no factor / offset".
    """
    c_type, factor_val, offset_val = 'standard', "1", "0"

    if '1/x' in factor_raw:
        c_type = 'inverse'
    elif 'Multi-Variable' in factor_raw:
        # Based on hdyc-calculators.js, amps-to-watts is 3col-mul while
        # watts-to-amps is 3col; lux-to-lumens is 3col-mul as well.
        c_type = '3col-mul' if slug in ('amps-to-watts', 'lux-to-lumens') else '3col'
    elif 'Logarithmic' in factor_raw or 'Exponential' in factor_raw:
        if 'db-int' in slug or 'intensity' in slug:
            c_type = 'db-int'
        elif 'spl' in slug or 'sound' in slug:
            c_type = 'db-spl'
        elif 'volts' in slug:
            c_type = 'db-v'
        else:
            c_type = 'db-w'
    elif 'Base 60' in factor_raw:
        if slug == 'degrees-minutes-and-seconds-to-decimal-degrees':
            c_type = 'dms-dd'
        else:
            c_type = 'dd-dms'
    elif 'GCD' in factor_raw or 'string split' in factor_raw or 'fraction' in slug:
        c_type = 'dec-frac'
    elif 'N/A' in factor_raw or 'Text' in factor_raw:
        if 'ascii' in slug:
            c_type = 'text-bin' if slug.startswith('ascii') else 'bin-text'
        elif any(word in slug for word in ('binary', 'hex', 'decimal', 'octal')):
            c_type = 'base'
    elif (any(token in factor_raw for token in ('10_to_2', '16_to_2', '10_to_16'))
          or 'base' in factor_raw.lower()):
        c_type = 'base'
    elif 'Linear Offset' in factor_raw:
        # e.g. "Linear Offset (1.8x + 32)"; the offset part is optional.
        match = re.search(r'Linear Offset \(([-\d\./]+)x\s*([+-]\s*[\d\.]+)\)', factor_raw)
        if match:
            offset_val = match.group(2).replace(' ', '')
        else:
            match = re.search(r'Linear Offset \(([-\d\./]+)x\)', factor_raw)
        if match:
            # The slope may be written as a fraction such as "5/9".
            slope = _ratio_text(match.group(1))
            if slope is not None:
                factor_val = slope
    elif _parse_float(factor_raw) is not None:
        # A plain numeric conversion factor.
        factor_val = factor_raw

    return c_type, factor_val, offset_val


def _named_base(slug_side):
    """Return the base named by the first matching token of *slug_side*, or None."""
    for token in slug_side.split('-'):
        for prefix, base in _BASE_NAME_PREFIXES:
            if token.startswith(prefix):
                return base
    return None


def _apply_base_fields(entry, slug, factor_raw):
    """Fill ``fromBase`` / ``toBase`` on *entry* from the slug, then the factor text."""
    # Resolve each side of '<from>-to-<to>' separately so that a word on one
    # side (e.g. 'hexadecimal') cannot overwrite the base of the other side.
    source_side, _, target_side = slug.partition('-to-')
    from_base = _named_base(source_side)
    to_base = _named_base(target_side)
    if from_base is not None:
        entry['fromBase'] = from_base
    if to_base is not None:
        entry['toBase'] = to_base

    # Slugs of the form 'base-2-to-base-10'.
    slug_match = re.match(r'base-(\d+)-to-base-(\d+)(?:-|$)', slug)
    if slug_match:
        entry.setdefault('fromBase', int(slug_match.group(1)))
        entry.setdefault('toBase', int(slug_match.group(2)))

    # Factor text such as "Base 2 → Base 10" or "base 8 to 16".
    factor_match = re.search(
        r'base\s*(\d+)\s*(?:→|to)\s*(?:base\s*)?(\d+)', factor_raw, re.IGNORECASE)
    if factor_match:
        entry.setdefault('fromBase', int(factor_match.group(1)))
        entry.setdefault('toBase', int(factor_match.group(2)))


def _data_unit_factor(in1, in2):
    """Return the exact factor converting data unit *in1* to *in2*, or None.

    None is returned when either label (after lower-casing and dropping one
    trailing 's') is not a key of ``_DATA_UNIT_BYTES``.
    """
    def size_of(label):
        key = label.lower()
        if key.endswith('s'):
            key = key[:-1]
        return _DATA_UNIT_BYTES.get(key)

    size_in, size_out = size_of(in1), size_of(in2)
    if size_in is None or size_out is None:
        return None
    return size_in / size_out


def _build_entry(raw_name, slug, factor_raw, desc_html):
    """Build the calculator definition dict for one table row."""
    display_name, teaser = split_name_and_teaser(raw_name)

    parsed = split_conversion_name(display_name)
    in1, in2 = parsed if parsed else ("From", "To")

    category = guess_category(display_name)
    c_type, factor_val, offset_val = _classify_conversion(slug, factor_raw)

    entry = {
        'slug': slug,
        'name': display_name,
        'category': category,
        'type': c_type,
    }
    if teaser:
        entry['teaser'] = teaser

    labels = {'in1': in1, 'in2': in2}
    if c_type in ('3col', '3col-mul'):
        # Label for the third input column.
        if 'watts' in slug and 'amps' in slug:
            labels['in3'] = 'Volts'
        elif 'lumens' in slug:
            labels['in3'] = 'Area (sq m)'
        elif 'moles' in slug:
            labels['in3'] = 'Molar Mass'
        else:
            labels['in3'] = 'Result'
    entry['labels'] = labels

    if c_type == 'standard':
        # Unparseable factor/offset text is ignored and the default is kept.
        if factor_val != "1":
            factor = _parse_float(factor_val)
            if factor is not None:
                entry['factor'] = factor
        if offset_val != "0":
            offset = _parse_float(offset_val)
            if offset is not None:
                entry['offset'] = offset

    if c_type == 'base':
        _apply_base_fields(entry, slug, factor_raw)

    if category == 'data' and c_type == 'standard':
        # Megabytes, Gigabytes, ... use decimal sizes (powers of 1000) and
        # Mebibytes, Gibibytes, ... use binary sizes (powers of 1024); when
        # both units are recognised the exact factor replaces the table value.
        data_factor = _data_unit_factor(in1, in2)
        if data_factor is not None:
            entry['factor'] = data_factor

    if desc_html:
        # Stored as plain text; json.dumps() escapes it when the file is rendered.
        entry['descriptionHTML'] = desc_html.replace('\n', '')

    return entry


def _mark_reverse_pairs(entries):
    """Set ``hidden`` on one entry of every "A to B" / "B to A" pair.

    The direction whose factor is below 1 is hidden; when an entry has no
    factor, the alphabetically later slug is hidden. An entry is never hidden
    if its reverse already is, so a pair cannot disappear entirely.
    """
    by_slug = {entry['slug']: entry for entry in entries}
    for entry in entries:
        parsed = split_conversion_name(entry['name'])
        if not parsed:
            continue
        rev_slug = f"{parsed[1]} to {parsed[0]}".lower().replace(' ', '-')
        reverse = by_slug.get(rev_slug)
        if reverse is None or rev_slug == entry['slug']:
            continue
        if reverse.get('hidden'):
            continue
        if 'factor' in entry:
            if entry['factor'] < 1.0:
                entry['hidden'] = True
        elif entry['slug'] > rev_slug:
            entry['hidden'] = True


def _render_typescript(entries):
    """Return the full text of the generated TypeScript module."""
    # Every field goes through json.dumps(), which yields valid JS literals and
    # escapes quotes, backslashes and backticks inside strings.
    rows = ''.join(f"  {json.dumps(entry)},\n" for entry in entries)
    return _TS_HEADER + rows + _TS_FOOTER


def process():
    """Regenerate ``OUTPUT_FILE`` from the calculator table in ``CALCLIST``.

    Reads the rows via ``parse_calculators_list()``, builds one definition per
    row (skipping header echoes and rows without a slug), hides one direction
    of each reverse pair, writes the TypeScript module as UTF-8 and prints a
    one-line summary.
    """
    external_descriptions = load_external_descriptions()

    entries = []
    for raw_name, slug, factor_raw in parse_calculators_list():
        if raw_name == 'Calculator Name' or not slug:
            continue
        desc_html = external_descriptions.get(slug, "")
        entries.append(_build_entry(raw_name, slug, factor_raw, desc_html))

    _mark_reverse_pairs(entries)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(_render_typescript(entries))

    print(f"Generated {len(entries)} calculators into calculators.ts")

# Run the migration only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process()