#!/usr/bin/env python3
"""Generate a JSON representation of the bilingual LaTeX content."""

from __future__ import annotations

import json
import re
from html import escape
from pathlib import Path
from typing import Dict, List, Tuple

ROOT = Path(__file__).resolve().parent.parent
MAIN_TEX = ROOT / "main.tex"
OUTPUT_JSON = ROOT / "web" / "translation-data.json"
GLOSSARY_TEX = ROOT / "content" / "glossar.tex"
FOOTNOTES_TEX = ROOT / "content" / "footnotes.tex"

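# Translation macros take two braced arguments: Danish text first, then German.
# Order matters: "\trans" must stay last because it is a prefix of the other
# macro names and _parse_macro tries candidates in insertion order.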
TRANS_MACROS = {
    "\\transSec": "section",
    "\\transSubSec": "subsection",
    "\\trans": "paragraph",
}


def _load_content_files() -> List[Path]:
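    r"""Collect the .tex files referenced via \input{...} in main.tex, in order."""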
    files: List[Path] = []
    content_pattern = re.compile(r"\\input\{([^}]+)\}")
    for line in MAIN_TEX.read_text(encoding="utf-8").splitlines():
        # Ignore \input references that are commented out.
        match = content_pattern.search(_strip_comments(line))
        if match:
            rel_path = match.group(1)
            candidate = (ROOT / rel_path).resolve()
            if candidate.suffix == "":
                candidate = candidate.with_suffix(".tex")
            files.append(candidate)
    return files


def _strip_comments(text: str) -> str:
    return re.sub(r"%.*", "", text)


def _parse_braced_argument(text: str, start: int) -> Tuple[str, int]:
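    """Return the {...} group content at ``start`` and the index past its closing brace."""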
    if text[start] != "{":
        raise ValueError(f"Expected '{{' at position {start}")
    depth = 0
    i = start + 1  # skip the opening brace
    content: List[str] = []
    while i < len(text):
        ch = text[i]
        if ch == "{":
            depth += 1
            content.append(ch)
        elif ch == "}":
            if depth == 0:
                return "".join(content), i + 1
            depth -= 1
            content.append(ch)
        else:
            content.append(ch)
        i += 1
    raise ValueError("Unterminated braced argument")


def _parse_macro(text: str, start: int) -> Tuple[Dict[str, object], int]:
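    """Parse one translation macro at ``start``; return its entry dict and the new index."""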
    for macro, entry_type in TRANS_MACROS.items():
        if text.startswith(macro, start):
            i = start + len(macro)
            while i < len(text) and text[i].isspace():
                i += 1
            arg1, i = _parse_braced_argument(text, i)
            while i < len(text) and text[i].isspace():
                i += 1
            arg2, i = _parse_braced_argument(text, i)
            return {
                "type": entry_type,
                "danish_raw": arg1.strip(),
                "german_raw": arg2.strip(),
            }, i
    raise ValueError(f"Unsupported macro at position {start}")


def _parse_heading(line: str) -> Tuple[str | None, str | None]:
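    r"""Map a \section*/\subsection* heading line to (HTML tag, title), or (None, None)."""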
    match = re.match(r"\\(sub)?section\*?\{([^}]*)\}", line)
    if not match:
        return None, None
    level = "h3" if match.group(1) else "h2"
    return level, match.group(2)


def _latex_to_html(text: str) -> str:
    """Convert the limited LaTeX markup in the sources to HTML."""

    result: List[str] = []
    i = 0
    while i < len(text):
        ch = text[i]
        if ch == "\\":
            if i + 1 < len(text) and text[i + 1] == "\\":
                result.append("<br>")
                i += 2
                continue
            j = i + 1
            while j < len(text) and text[j].isalpha():
                j += 1
            command = text[i + 1 : j]
            i = j
            command_lower = command.lower()

            def consume_argument() -> str:
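                """Consume an optional {...} argument after the command, advancing ``i``."""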
                nonlocal i
                while i < len(text) and text[i].isspace():
                    i += 1
                if i >= len(text) or text[i] != "{":
                    return ""
                arg, i = _parse_braced_argument(text, i)
                return arg

            if command_lower == "textcolor":
                color_raw = consume_argument()
                content_raw = consume_argument()
                color = _latex_to_html(color_raw)
                inner = _latex_to_html(content_raw)
                result.append(f'<span class="textcolor textcolor-{color}">{inner}</span>')
            elif command_lower == "emph":
                content_raw = consume_argument()
                result.append(f"<em>{_latex_to_html(content_raw)}</em>")
            elif command_lower == "enquote":
                content_raw = consume_argument()
                result.append(f"&bdquo;{_latex_to_html(content_raw)}&ldquo;")
            elif command_lower == "alt":
                first_raw = consume_argument()
                second_raw = consume_argument()
                first = _latex_to_html(first_raw)
                second = _latex_to_html(second_raw)
                result.append(f'<span class="alt-term">{first} / {second}</span>')
            elif command_lower in {"anm", "todo", "evl"}:
                content_raw = consume_argument()
                result.append(_latex_to_html(content_raw))
            else:
                content_raw = consume_argument()
                if content_raw:
                    result.append(_latex_to_html(content_raw))
        elif ch == "$":
            end = text.find("$", i + 1)
            if end != -1:
                content = text[i + 1 : end]
                match = re.match(r"\^\{\(([^}]+)\)\}", content)
                if match:
                    footnote_id = match.group(1)
                    result.append(
                        f'<sup class="footnote-ref" data-footnote-id="{footnote_id}">{footnote_id}</sup>'
                    )
                else:
                    # Pass other inline math through, escaped for HTML.
                    result.append(escape(content, quote=False))
                i = end + 1
            else:
                result.append("$")
                i += 1
        elif ch in "{}":
            i += 1
        else:
            # Escape characters that are meaningful in HTML.
            result.append(escape(ch, quote=False))
            i += 1

    html = "".join(result)
    html = html.replace("---", "&mdash;")
    html = html.replace("--", "&ndash;")
    return html.strip()


def _split_paragraphs(text: str) -> List[str]:
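    """Split on blank lines and collapse runs of whitespace to single spaces."""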
    raw_parts = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
    if not raw_parts:
        raw_parts = [text.strip()] if text.strip() else []
    cleaned: List[str] = []
    for part in raw_parts:
        normalized = re.sub(r"\s+", " ", part)
        cleaned.append(normalized.strip())
    return cleaned


def parse_glossary(path: Path) -> str:
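    """Render the glossary source as one HTML string of headings and paragraphs."""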
    text = _strip_comments(path.read_text(encoding="utf-8"))
    blocks: List[str] = []
    paragraph_lines: List[str] = []

    def flush_paragraph() -> None:
        nonlocal paragraph_lines
        if not paragraph_lines:
            return
        paragraph_text = " ".join(paragraph_lines).strip()
        if paragraph_text:
            blocks.append(f"<p>{_latex_to_html(paragraph_text)}</p>")
        paragraph_lines = []

    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            flush_paragraph()
            continue

        level, title = _parse_heading(line)
        if level and title:
            flush_paragraph()
            blocks.append(f"<{level}>{_latex_to_html(title)}</{level}>")
            continue

        paragraph_lines.append(line)

    flush_paragraph()
    return "\n".join(blocks)


def _extract_footnote_id(html: str) -> str | None:
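    """Return the footnote id embedded in a rendered HTML fragment, if any."""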
    match = re.search(r'data-footnote-id="([^"]+)"', html)
    return match.group(1) if match else None


def parse_footnotes(path: Path) -> List[Dict[str, str]]:
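    """Parse footnotes.tex into a list of {id, danish, german} dicts.

    Each footnote id is expected to occur twice in source order: Danish text
    first, then German.  Extra occurrences are appended to the German text;
    if the German half is missing, the Danish text is reused as a fallback.
    """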
    raw_text = _strip_comments(path.read_text(encoding="utf-8"))

    pattern = re.compile(r"\$\^\{\(([^)]+)\)\}\s*", re.MULTILINE)
    matches = list(pattern.finditer(raw_text))
    ordered_ids: List[str] = []
    footnote_map: Dict[str, Dict[str, str]] = {}

    for idx, match in enumerate(matches):
        start = match.end()
        end = matches[idx + 1].start() if idx + 1 < len(matches) else len(raw_text)
        note_text = raw_text[start:end].strip()
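        # The marker regex stops before the math group's closing "$", so strip
        # it here along with stray braces and whitespace.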
        cleaned_text = note_text.strip("{} ")
        cleaned_text = re.sub(r"^\$\s*", "", cleaned_text)
        html_text = _latex_to_html(cleaned_text)

        note_id = match.group(1)
        if note_id not in footnote_map:
            ordered_ids.append(note_id)
            footnote_map[note_id] = {"id": note_id, "danish": "", "german": ""}

        entry = footnote_map[note_id]
        if not entry["danish"]:
            entry["danish"] = html_text
        elif not entry["german"]:
            entry["german"] = html_text
        else:
            entry["german"] += f" {html_text}"

    for entry in footnote_map.values():
        if not entry["german"]:
            entry["german"] = entry["danish"]

    return [footnote_map[note_id] for note_id in ordered_ids]


def parse_file(path: Path) -> List[Dict[str, object]]:
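    """Extract every translation-macro entry from one content file, in source order."""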
    entries: List[Dict[str, object]] = []
    text = _strip_comments(path.read_text(encoding="utf-8"))
    i = 0
    while i < len(text):
        if text[i] == "\\":
            try:
                entry, i = _parse_macro(text, i)
            except ValueError:
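                # Not a translation macro (or malformed); skip past this backslash.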
                i += 1
                continue
            entry["danish"] = [_latex_to_html(p) for p in _split_paragraphs(entry.pop("danish_raw"))]
            entry["german"] = [_latex_to_html(p) for p in _split_paragraphs(entry.pop("german_raw"))]
            entries.append(entry)
        else:
            i += 1
    return entries


def main() -> None:
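    """Assemble the JSON payload and write it to web/translation-data.json."""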
    content_files = [path for path in _load_content_files() if path.name != "glossar.tex"]
    data: List[Dict[str, object]] = []
    for file_path in content_files:
        data.extend(parse_file(file_path))

    glossary_html = parse_glossary(GLOSSARY_TEX)
    footnotes = parse_footnotes(FOOTNOTES_TEX)
    payload = {"glossary": glossary_html, "entries": data, "footnotes": footnotes}

    json_text = json.dumps(payload, ensure_ascii=False, indent=2)
    OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)  # ensure web/ exists
    OUTPUT_JSON.write_text(json_text + "\n", encoding="utf-8")


if __name__ == "__main__":
    main()
