// hyperrogue/langen.cpp

// Hyperbolic Rogue language file generator

// Copyright (C) 2011-2018 Zeno Rogue, see 'hyper.cpp' for details

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

// Gender codes. They are not referenced directly in this file; presumably the
// included language-*.cpp tables use them as the gender argument of N(), which
// ends up in noun2::genus -- TODO confirm. The names suggest
// masculine / feminine / neuter / other.
#define GEN_M 0
#define GEN_F 1
#define GEN_N 2
#define GEN_O 3

// Flag bits that sit above the gender codes (bits 3..5), presumably OR-ed into
// the same field by the language tables -- TODO confirm.
#define GENF_ELISION    (1 << 3)
#define GENF_PLURALONLY (1 << 4)
#define GENF_PROPER     (1 << 5)

// IF_MAC(y,z) expands to y when MAC is defined to a nonzero value,
// and to z otherwise (including when MAC is not defined at all).
#if MAC
 #define IF_MAC(y,z) y
#else
 #define IF_MAC(y,z) z
#endif

// Size of a container as an int, so it can be compared with int loop counters.
template<class T> int isize(const T& x) {
  return static_cast<int>(x.size());
  }

// Number of language slots: slot 0 plus the eight slots (1..8) filled by the
// lang*() loaders below.
#define NUMLAN 9

// Output stream for the generated file; main() points it at stdout or at the
// file named after "-o".
FILE *f;

// language generator

// Code of the language currently being loaded ("-" outside of a loader);
// used to tag the duplicate-translation warning in dictionary::add.
std::string current_language;

// Forward declaration: dictionary::add calls escape() before its definition.
const char *escape(std::string s, const std::string& dft);

// Map from a key string to a value of type T, with duplicate detection on add().
template<class T> struct dictionary {
  std::map<std::string, T> m;

  // Registers `val` under key `s`. The first registration wins: a later call
  // for the same key stores nothing, and if its value differs it writes a
  // warning comment (tagged with current_language) to the output file.
  void add(const std::string& s, T val) {
    const auto found = m.find(s);
    if (found != m.end()) {
      if (val != found->second)
        fprintf(f, "// #warning Two translations for %s [%s]\n", escape(s, s), current_language.c_str());
      return;
      }
    m.emplace(s, std::move(val));
    }

  // Mutable lookup; inserts a value-initialized entry when `s` is absent.
  T& operator [] (const std::string& s) { return m[s]; }

  // 1 when `s` has an entry, 0 otherwise.
  int count(const std::string& s) const { return m.count(s); }
  };

// Sentence translations, one dictionary per language slot. Slots 1..8 are
// filled by the lang*() loaders; slot 0 is filled in main() with every key of
// slot 1 mapped to itself.
dictionary<std::string> d[NUMLAN];

// Plain-data noun entry as produced by the N() macro in the lang*() loaders;
// implicitly convertible to `noun`. The grammatical meaning of each field is
// inferred from its name -- TODO confirm against the language tables.
struct noun2 {
  int genus;         // gender argument of N() (presumably a GEN_* code, possibly with GENF_* flags)
  const char* nom;   // presumably nominative singular
  const char* nomp;  // presumably nominative plural
  const char* acc;   // presumably accusative
  const char* abl;   // presumably ablative
  };

// Owning counterpart of noun2: the same five fields, with the four forms
// stored as std::string.
struct noun {
  // Initialized so that a default-initialized noun never carries an
  // indeterminate gender value.
  int genus = 0;
  std::string nom, nomp, acc, abl;
  noun() = default;
  // Implicit on purpose: the loaders pass noun2 table entries straight to
  // dictionary<noun>::add. All four pointers in `n` must be non-null.
  noun(const noun2& n) : genus(n.genus), nom(n.nom), nomp(n.nomp), acc(n.acc), abl(n.abl) {}
  // Two nouns are equal when the gender value and all four forms match.
  friend bool operator==(const noun& a, const noun& b) {
    return std::tie(a.genus, a.nom, a.nomp, a.acc, a.abl) == std::tie(b.genus, b.nom, b.nomp, b.acc, b.abl);
    }
  // Defined through operator== so the two comparisons cannot drift apart.
  friend bool operator!=(const noun& a, const noun& b) {
    return !(a == b);
    }
  };

// Noun translations, indexed by language slot like `d`. Slot 0 is filled in
// main() from the keys of slot 1, with all four forms set to the key itself.
dictionary<noun> nouns[NUMLAN];

// Length in bytes of a UTF-8 sequence, judged from its first byte `c`.
// Bytes in 0x80..0xBF (continuation bytes) are reported as 2.
int utfsize(char c) {
  const unsigned lead = static_cast<unsigned char>(c);
  return lead < 0x80 ? 1
       : lead < 0xE0 ? 2
       : lead < 0xF0 ? 3
       : 4;
  }

// Splits `w` into chunks whose lengths come from utfsize() of each chunk's
// first byte, and inserts every chunk into `s`. A final chunk that would run
// past the end of `w` is cut short by substr().
void addutftoset(std::set<std::string>& s, const std::string& w) {
  for(size_t pos = 0; pos < w.size(); ) {
    const int len = utfsize(w[pos]);
    s.insert(w.substr(pos, len));
    pos += len;
    }
  }

void addutftoset(std::set<std::string>& s, const noun& w) {
  addutftoset(s, w.nom);
  addutftoset(s, w.nomp);
  addutftoset(s, w.acc);
  addutftoset(s, w.abl);
  }

// Adds the characters of every value stored in dictionary `w` to `s`
// (keys are not scanned), using the addutftoset overload for T.
template<class T>
void addutftoset(std::set<std::string>& s, const dictionary<T>& w) {
  for(auto it = w.m.begin(); it != w.m.end(); ++it)
    addutftoset(s, it->second);
  }

// Every distinct character collected for the generated NATCHARS list: seeded
// in main() with extra symbols, then extended with the characters of all
// translations.
std::set<std::string> allchars;

// Integer type of the hash keys written to the generated tables.
using hashcode = unsigned;

// Multiplier of the polynomial hash below; main() picks it and retries until
// no two keys collide.
hashcode hashval;

// Polynomial hash of `s`: for each char in order, acc = hashval * acc + char,
// evaluated in unsigned arithmetic.
hashcode langhash(const std::string& s) {
  hashcode acc = 0;
  for(size_t i = 0; i < s.size(); i++)
    acc = acc * hashval + s[i];
  return acc;
  }

// Maps langhash(key) -> key for every key in `s`. Keys whose hashes collide
// overwrite one another, so the result has fewer entries than `s` exactly when
// a collision occurred.
std::map<hashcode, std::string> buildHashTable(std::set<std::string>& s) {
  std::map<hashcode, std::string> table;
  for(auto it = s.begin(); it != s.end(); ++it) {
    const std::string& key = *it;
    table[langhash(key)] = key;
    }
  return table;
  }

// Returns `s` as a double-quoted C string literal, with backslash, newline and
// double quote escaped. An empty `s` is replaced by `dft`, and a "/*MISSING*/ "
// marker is written to the output file first.
// The result points into a function-local static buffer that the next call
// overwrites, so it must be consumed before escape() is called again.
const char *escape(std::string s, const std::string& dft) {
  if(s.empty()) {
    fprintf(f, "/*MISSING*/ ");
    s = dft;
    }
  static std::string t;
  t = "\"";
  for(char ch : s) {
    switch(ch) {
      case '\\': t += "\\\\"; break;
      case '\n': t += "\\n"; break;
      case '\"': t += "\\\""; break;
      default:   t += ch; break;
      }
    }
  t += "\"";
  return t.c_str();
  }

// English noun keys that get a flag in the generated all_nouns table:
// membership in `nothe` adds 1 to the flag field and membership in `plural`
// adds 2 (see main()). Presumably "takes no article" and "grammatically
// plural" -- TODO confirm against the consumer of the table.
std::set<std::string> nothe;
std::set<std::string> plural;


 // Loads the "PL" tables into slot 1. language-pl.cpp is included twice: first
 // with N() expanding to nothing, so only the S() sentence pairs remain, then
 // with S() expanding to nothing, so only the N() noun entries remain.
 void langPL() {
  current_language = "PL";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e,f)
    #include "language-pl.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e,f) { a, noun2{ b, c, d, e, f } },
    #include "language-pl.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[1].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[1].add(row.first, row.second);
  current_language = "-";
}

// Loads the "TR" tables into slot 2. language-tr.cpp is included twice: first
// with N() expanding to nothing, so only the S() sentence pairs remain, then
// with S() expanding to nothing, so only the N() noun entries remain.
void langTR() {
  current_language = "TR";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e,f)
    #include "language-tr.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e,f) { a, noun2{ b, c, d, e, f } },
    #include "language-tr.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[2].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[2].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "CZ" tables into slot 3. language-cz.cpp is included twice: first
// with N() expanding to nothing, so only the S() sentence pairs remain, then
// with S() expanding to nothing, so only the N() noun entries remain.
void langCZ() {
  current_language = "CZ";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e,f)
    #include "language-cz.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e,f) { a, noun2{ b, c, d, e, f } },
    #include "language-cz.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[3].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[3].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "RU" tables into slot 4. language-ru.cpp is included twice: first
// with N() expanding to nothing, so only the S() sentence pairs remain, then
// with S() expanding to nothing, so only the N() noun entries remain.
void langRU() {
  current_language = "RU";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e,f)
    #include "language-ru.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e,f) { a, noun2{ b, c, d, e, f } },
    #include "language-ru.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[4].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[4].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "DE" tables into slot 5. language-de.cpp is included twice: first
// with N() expanding to nothing (sentences only), then with S() expanding to
// nothing (nouns only). N() takes five arguments here; its last argument is
// stored as both the `acc` and the `abl` form.
void langDE() {
  current_language = "DE";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e)
    #include "language-de.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e) { a, noun2{ b, c, d, e, e } },
    #include "language-de.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[5].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[5].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "PT" tables into slot 6. language-ptbr.cpp is included twice:
// first with N() expanding to nothing (sentences only), then with S()
// expanding to nothing (nouns only). N() takes five arguments here; the `acc`
// form is always stored as "" and the last argument becomes the `abl` form.
void langPT() {
  current_language = "PT";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e)
    #include "language-ptbr.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e) { a, noun2{ b, c, d, "", e } },
    #include "language-ptbr.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[6].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[6].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "FR" tables into slot 7. language-fr.cpp is included twice: first
// with N() expanding to nothing (sentences only), then with S() expanding to
// nothing (nouns only). N() takes five arguments here; its last argument is
// stored as both the `acc` and the `abl` form.
void langFR() {
  current_language = "FR";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e)
    #include "language-fr.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e) { a, noun2{ b, c, d, e, e } },
    #include "language-fr.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[7].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[7].add(row.first, row.second);
  current_language = "-";
  }

// Loads the "ZH" tables into slot 8. language-zh.cpp is included twice: first
// with N() expanding to nothing (sentences only), then with S() expanding to
// nothing (nouns only). N() takes five arguments here; its third argument is
// stored as both the `nom` and the `nomp` form.
void langZH() {
  current_language = "ZH";
  static std::pair<const char *, const char *> sentence_rows[] = {
    #define S(a,b) { a, b },
    #define N(a,b,c,d,e)
    #include "language-zh.cpp"
    #undef N
    #undef S
    };
  static std::pair<const char *, noun2> noun_rows[] = {
    #define S(a,b)
    #define N(a,b,c,d,e) { a, noun2{ b, c, c, d, e } },
    #include "language-zh.cpp"
    #undef N
    #undef S
    };
  for(const auto& row : sentence_rows)
    d[8].add(row.first, row.second);
  for(const auto& row : noun_rows)
    nouns[8].add(row.first, row.second);
  current_language = "-";
  }

// Translation coverage counters, accumulated by compute_completeness():
// [0] counts every distinct key seen, [i] counts the keys present in slot i.
// main() writes the array out as `transcompleteness`.
int completeness[NUMLAN];

// Checks coverage of `dict` (an array of NUMLAN dictionaries) across slots
// 1..NUMLAN-1. For every key found in at least one slot it
//  - writes a "Missing" warning comment when the key exists in some slot other
//    than PT-BR/TR/DE but is absent from a slot other than TR/DE/PT-BR/RU;
//  - bumps completeness[0], and completeness[i] for each slot i holding the key.
// Language codes are read from the "EN" entry of the sentence dictionary d[i].
template<class T>
void compute_completeness(const T& dict)
{
  // Union of the keys of all translated slots (slot 0 is skipped).
  std::set<std::string> keys;
  for(int lang=1; lang<NUMLAN; lang++)
    for(const auto& entry : dict[lang].m)
      keys.insert(entry.first);

  // Removes the single trailing space left by the "+= code + ' '" appends.
  auto drop_trailing_space = [] (std::string& list) {
    if(!list.empty()) list.pop_back();
    };

  for(const std::string& key : keys) {
    std::string missing_main, missing_minor, present_in;
    bool in_important = false;

    for(int lang=1; lang<NUMLAN; lang++) {
      // count() is evaluated before d[lang]["EN"], which may insert into d.
      const bool present = dict[lang].count(key) != 0;
      const std::string code = d[lang]["EN"];
      const bool minor = code == "TR" || code == "DE" || code == "PT-BR";

      if(present) {
        if(!minor) in_important = true;
        present_in += code + " ";
        }
      else if(minor || code == "RU")
        missing_minor += code + " ";
      else
        missing_main += code + " ";
      }

    drop_trailing_space(missing_main);
    drop_trailing_space(missing_minor);
    drop_trailing_space(present_in);

    if(in_important && !missing_main.empty())
      fprintf(f, "// #warning Missing [%s : %s] from [%s]: %s\n", missing_main.c_str(), missing_minor.c_str(), present_in.c_str(), escape(key, "?"));

    completeness[0]++;
    for(int lang=1; lang<NUMLAN; lang++)
      if(dict[lang].count(key)) completeness[lang]++;
    }
  }

// Entry point. With no arguments the generated file goes to standard output;
// with "-o <filename>" it is written to that file. Any other argument list
// prints the usage line and exits with status 1. Exits with status 1 as well
// when the output file cannot be opened or closed successfully.
int main(int argc, char ** argv) {

  f = stdout;
  if(argc == 3 && strcmp(argv[1], "-o") == 0) {
    f = fopen(argv[2], "wt");
    // Without this check every later fprintf(f, ...) would be handed a null stream.
    if(!f) {
      perror(argv[2]);
      exit(1);
      }
    }
  else if(argc != 1) {
    printf("Usage: langen -o <filename>, or without arguments to output to standard output\n");
    exit(1);
    }

  fprintf(f, "// DO NOT EDIT -- this file is generated automatically with langen\n\n");

  // English noun keys flagged in the generated all_nouns table (see below).
  nothe.insert("R'Lyeh");
  nothe.insert("Camelot");
  nothe.insert("Hell");
  plural.insert("Crossroads");
  plural.insert("Crossroads II");
  plural.insert("Crossroads III");
  plural.insert("Elemental Planes");
  plural.insert("Crossroads IV");
  plural.insert("Kraken Depths");

  // Characters that go into NATCHARS even if no translation uses them.
  for(auto s: {
    // superscripts
    "¹", "²", "³", "⁻", "ᵈ", "ᶻ",
    // other math symbols
    "∞", "½", "√", "×",
    // Greek letters
    "α", "β", "γ", "δ", "ε", "ζ", "η", "θ", "ι", "κ", "λ", "μ", "ν", "ξ", "ο", "π", "ρ", "σ", "τ", "υ", "φ", "χ", "ψ", "ω",
    // Greek capital letters
    "Δ", "Ω", "Θ", "Λ", "Γ", "Ξ", "Π", "Σ", "Φ", "Ψ",
    // other symbols
    "⌫", "⏎", "←", "→", "␣",
    // international letters
    "ñ", "Ñ", "ê", "Ç", "Ą", "Ć", "Ę", "Ł", "Ń", "Ó", "Ś", "Ź", "Ż"
    })
    allchars.insert(s);

  // Load every language. The call order decides the order of any
  // duplicate-translation warnings in the output.
  langPL(); langCZ(); langRU();
  langTR(); langDE(); langPT();
  langFR(); langZH();

  // verify
  compute_completeness(d);
  compute_completeness(nouns);

  for(int i=1; i<NUMLAN; i++) {
    addutftoset(allchars, d[i]);
    addutftoset(allchars, nouns[i]);
    }

  // Only multi-byte characters are listed in NATCHARS / javastring.
  std::string javastring;
  std::vector<std::string> vchars;
  for(auto&& elt : allchars) {
    if(isize(elt) >= 2) { javastring += elt; vchars.push_back(elt); }
    }
  fprintf(f, "\n");
  fprintf(f, "#if HDR\n");
  fprintf(f, "#if CAP_TRANS\n");
  fprintf(f, "#define NUMEXTRA %d\n", isize(vchars));
  fprintf(f, "#define NATCHARS {");
  for(auto&& elt : vchars) fprintf(f, "\"%s\",", elt.c_str());
  fprintf(f, "}\n");
  fprintf(f, "extern const char* natchars[NUMEXTRA];\n");
  fprintf(f, "#endif\n");
  fprintf(f, "#endif\n");
  fprintf(f, "const char* natchars[NUMEXTRA] = NATCHARS;\n");
  fprintf(f, "//javastring = \"%s\";\n", javastring.c_str());

  fprintf(f, "\nEX int transcompleteness[NUMLAN] = {");
  for(int i=0; i<NUMLAN; i++) fprintf(f, "%d, ", completeness[i]);
  fprintf(f, "};\n");

  fprintf(f, "\n//statistics\n");
  // Slot 0 mirrors the key set of slot 1: each key maps to itself.
  for(auto&& elt : d[1].m)
    d[0][elt.first] = elt.first;
  for(auto&& elt : nouns[1].m) {
    noun n = elt.second;
    n.nom = n.nomp = n.acc = n.abl = elt.first;
    nouns[0][elt.first] = n;
    }

  fprintf(f, "// total: %5d nouns, %5d sentences\n", isize(nouns[1].m), isize(d[1].m));

  // Per-slot byte totals of the stored translations.
  for(int i=0; i<NUMLAN; i++) {
    size_t bnouns = 0;
    size_t bdict = 0;

    for(auto&& elt : d[i].m)
      bdict += elt.second.size();
    for(auto&& elt : nouns[i].m) {
      const noun& n = elt.second;
      bnouns += n.nom.size();
      bnouns += n.nomp.size();
      bnouns += n.acc.size();
      bnouns += n.abl.size();
      }

    fprintf(f, "// %s: %5dB nouns, %5dB sentences\n",
      d[i]["EN"].c_str(), int(bnouns), int(bdict));
    }

  // The generated tables are keyed by the entries of slot 1.
  std::set<std::string> allsent;
  for(auto&& elt : d[1].m)
    allsent.insert(elt.first);

  std::set<std::string> allnouns;
  for(auto&& elt : nouns[1].m)
    allnouns.insert(elt.first);

  std::map<hashcode, std::string> ms, mn;

  // Try multipliers from rand() until neither key set has a hash collision
  // (a collision makes the hash table smaller than the key set). No srand()
  // call is visible in this file, so the candidate sequence is whatever the
  // C library's default seed yields.
  do {
    hashval = rand();
    fprintf(f, "// check hash: %x\n", hashval);
    ms = buildHashTable(allsent);
    mn = buildHashTable(allnouns);
    }
  while(ms.size() != allsent.size() || mn.size() != allnouns.size());

  fprintf(f, "hashcode hashval = 0x%x;\n\n", hashval);

  fprintf(f, "sentence all_sentences[] = {\n");

  for(auto&& elt : ms) {
    const std::string& s = elt.second;
    fprintf(f, "  {0x%x, { // %s\n", elt.first, escape(s, s));
    // d[i][s] inserts an empty entry when slot i lacks the key; escape() then
    // emits the /*MISSING*/ marker and falls back to the English key.
    for(int i=1; i<NUMLAN; i++) fprintf(f, "   %s,\n", escape(d[i][s], s));
    fprintf(f, "    }},\n");
    }
  fprintf(f, "  };\n\n");

  fprintf(f, "fullnoun all_nouns[] = {\n");

  for(auto&& elt : mn) {
    const std::string& s = elt.second;
    fprintf(f, "  {0x%x, %d, { // \"%s\"\n", elt.first,
      (nothe.count(s) ? 1:0) + (plural.count(s) ? 2:0),
      escape(s, s));

    for(int i=1; i<NUMLAN; i++) {
      // Single lookup per row; inserts an empty noun when slot i lacks the key.
      // Each escape() result is consumed by its own fprintf before the next call.
      const noun& n = nouns[i][s];
      fprintf(f, "    {%d", n.genus);
      fprintf(f, ", %s", escape(n.nom, s));
      fprintf(f, ", %s", escape(n.nomp, s));
      fprintf(f, ", %s", escape(n.acc, s));
      fprintf(f, ", %s},\n", escape(n.abl, s));
      }

    fprintf(f, "    }},\n");
    }

  fprintf(f, "  };\n");

  // f differs from stdout only on the "-o" path, where argv[2] is the file name.
  // A failing fclose() means buffered output could not be written out.
  if(f != stdout && fclose(f) != 0) {
    perror(argv[2]);
    return 1;
    }
  return 0;
  }