// hyperrogue/langen.cpp

// Hyperbolic Rogue language file generator

// Copyright (C) 2011-2018 Zeno Rogue, see 'hyper.cpp' for details

// Numeric noun class codes. Nothing in this file references them directly;
// presumably the N(...) entries of the included language files pass them as
// the genus argument (masculine / feminine / neuter / other?) -- TODO confirm.
#define GEN_M 0
#define GEN_F 1
#define GEN_N 2
#define GEN_O 3

#include <map>
#include <string>
#include <stdio.h>
#include <vector>
#include <stdlib.h>
using namespace std;

// Returns x.size() converted to int, so that it can be compared against int
// loop indices. The argument is taken by const reference: taking it by value
// copied the whole container on every call, including in loop conditions.
template<class T> int size(const T& x) { return int(x.size()); }

// Number of language slots. Slot 0 is filled in main() with the untranslated
// keys; slots 1..6 are filled by the lang*() functions.
#define NUMLAN 7

// language generator

const char *escape(string s, string dft);

// String-keyed table of T, a thin wrapper around map<string, T>.
template<class T> struct dictionary {
  // underlying storage; iterated directly by the rest of the file
  map<string, T> m;

  // Stores val under s. If s is already taken, " [repeat]" is appended to the
  // key (as many times as necessary) until an unused key is found, so no
  // existing entry is ever overwritten.
  void add(const string& s, const T& val) {
    string key = s;
    while(m.find(key) != m.end()) key += " [repeat]";
    m[key] = val;
    }

  // Returns the entry for s, creating a default-constructed one if absent.
  T& operator [] (const string& s) { return m[s]; }

  // 1 if s is present, 0 otherwise.
  int count(const string& s) { return m.find(s) == m.end() ? 0 : 1; }

  // Removes all entries.
  void clear() { m.clear(); }
  };

// Sentence tables, one per language slot: d[1]..d[6] are filled by the
// lang*() functions through their S(key, text) macro, d[0] is filled in
// main() with every key of d[1] mapped to itself.
dictionary<string> d[NUMLAN];

// One noun entry of one language, as filled in by the N(...) macros and
// printed by main() in the order genus, nom, nomp, acc, abl.
struct noun {
  // numeric class code; presumably one of the GEN_* values -- TODO confirm
  int genus;
  // the four text forms; judging by the names these are nominative,
  // nominative plural, accusative and ablative -- TODO confirm
  string nom, nomp, acc, abl;
  };

// Noun tables, one per language slot: nouns[1]..nouns[6] are filled by the
// lang*() functions through their N(...) macro, nouns[0] is filled in main()
// from the keys of nouns[1].
dictionary<noun> nouns[NUMLAN];

#include <set>

// Returns the length in bytes of the UTF-8 sequence introduced by lead byte c:
//   0x00..0x7F -> 1,  0x80..0xDF -> 2,  0xE0..0xEF -> 3,  0xF0..0xFF -> 4.
// The 3-byte bound was previously 0xE0, which is the same value as the 2-byte
// bound (224); that branch could never fire, so 3-byte characters were
// reported as 4 bytes long and swallowed the byte that followed them.
// Continuation bytes (0x80..0xBF) are not rejected; they land in the 2-byte
// range, exactly as before.
int utfsize(char c) {
  unsigned char cu = c;
  if(cu < 0x80) return 1;
  if(cu < 0xE0) return 2;
  if(cu < 0xF0) return 3;
  return 4;
  }

// Splits w into UTF-8 characters (lengths taken from utfsize() of each lead
// byte) and inserts every character into s as its own string.
void addutftoset(set<string>& s, string& w) {
  const int total = size(w);
  for(int pos = 0; pos < total; ) {
    const int len = utfsize(w[pos]);
    // substr clamps the length, so a sequence cut off at the end is still inserted
    s.insert(w.substr(pos, len));
    pos += len;
    }
  }

void addutftoset(set<string>& s, noun& w) {
  addutftoset(s, w.nom);
  addutftoset(s, w.nomp);
  addutftoset(s, w.acc);
  addutftoset(s, w.abl);
  }

// Inserts the UTF-8 characters of every value stored in w into s; the keys
// of w are not scanned.
template<class T>
void addutftoset(set<string>& s, dictionary<T>& w) {
  typedef typename map<string,T>::iterator entry_iterator;
  entry_iterator cur = w.m.begin();
  const entry_iterator last = w.m.end();
  while(cur != last) {
    addutftoset(s, cur->second);
    ++cur;
    }
  }

// Every UTF-8 character seen in any translation (filled by printletters()),
// plus a few characters inserted by hand at the start of main(). main() emits
// the multi-byte ones as NATCHARS.
set<string> allchars;

// Adds every UTF-8 character that occurs in the sentence texts of la and in
// the noun forms of nounla to the global allchars set.
//
// Despite the name nothing is printed. The function used to fill a second,
// local set as well, but that set was only read by commented-out debug
// output, so every dictionary was traversed twice for no effect; the dead
// set has been dropped. To list the letters of a single language again,
// collect into a local set<string> with addutftoset() and print it.
//
// lang is not used; it is kept so that existing callers remain valid.
void printletters(dictionary<string>& la, dictionary<noun>& nounla, const char *lang) {
  (void) lang;
  addutftoset(allchars, la);
  addutftoset(allchars, nounla);
  }

// Type of the string hashes computed by langhash() and printed into the
// generated tables.
typedef unsigned hashcode;

// Multiplier used by langhash(). main() assigns it from rand() repeatedly
// until neither the sentence keys nor the noun keys collide.
hashcode hashval;

// Tells whether s contains the " [repeat]" marker (anywhere in the string)
// that dictionary::add appends to keys that were already taken.
bool isrepeat(const string& s) {
  const string::size_type where = s.find(" [repeat]");
  if(where == string::npos) return false;
  return true;
  }

// Hash of a dictionary key.
// While the key is recognised by isrepeat(), its last 9 characters (the
// length of " [repeat]") are cut off and the result is increased by one per
// cut. The remaining text is hashed as h = hashval * h + character, starting
// from 0, in unsigned arithmetic; characters are added as plain char values.
hashcode langhash(const string& s) {
  string base = s;
  hashcode bonus = 0;
  while(isrepeat(base)) {
    base = base.substr(0, base.size() - 9);
    bonus++;
    }
  hashcode h = 0;
  for(string::const_iterator ch = base.begin(); ch != base.end(); ++ch)
    h = hashval * h + *ch;
  return h + bonus;
  }

// Maps langhash(text) to text for every string in s. When two strings share
// a hash the later one (in set order) replaces the earlier one, so the result
// has fewer entries than s exactly when there was a collision.
map<hashcode, string> buildHashTable(set<string>& s) {
  map<hashcode, string> table;
  set<string>::iterator cur = s.begin();
  while(cur != s.end()) {
    const string& text = *cur;
    table[langhash(text)] = text;
    ++cur;
    }
  return table;
  }

// Returns s as a double-quoted C string literal: backslash, newline and the
// double quote are escaped, every other byte is copied unchanged.
// An empty s is treated as a missing translation: "/*MISSING*/ " is printed
// to stdout and dft is quoted instead.
// The result points into a function-local static buffer that is overwritten
// by the next call, so it has to be consumed before escape() is called again.
const char *escape(string s, string dft) {
  static string t;
  if(s.empty()) {
    printf("/*MISSING*/ ");
    s = dft;
    }
  t = "\"";
  for(string::const_iterator ch = s.begin(); ch != s.end(); ++ch) {
    switch(*ch) {
      case '\\': t += "\\\\"; break;
      case '\n': t += "\\n"; break;
      case '\"': t += "\\\""; break;
      default: t += *ch;
      }
    }
  t += "\"";
  return t.c_str();
  }

// Noun keys that get special flags in the generated all_nouns table:
// membership in nothe adds 1 to the flag field, membership in plural adds 2.
// Both sets are filled at the start of main().
set<string> nothe;
set<string> plural;

// Optional consistency check: when CHECKALL is defined, allstr is initialised
// from the contents of the file "d" (not visible here; presumably a
// comma-separated list of string literals -- TODO confirm) and main() compares
// it against the set of known keys.
#ifdef CHECKALL
const char* allstr[] = {
#include "d"
  };
#endif

// Prints one comment line with the label bn, the number of strings in s and
// the sum of their lengths in bytes.
void setstats(set<string>& s, const char* bn) {
  int count = 0;
  int bytes = 0;
  set<string>::iterator cur = s.begin();
  while(cur != s.end()) {
    ++count;
    bytes += cur->size();
    ++cur;
    }
  printf("// %-10s %5d %5d\n", bn, count, bytes);
  }

// Fills language slot 1 from "language-pl.cpp" (presumably Polish, judging by
// the names; the file itself is not visible here).
// While the file is included, S(key, text) stores a sentence in d[1] and
// N(key, genus, nom, nomp, acc, abl) stores a noun in nouns[1]. Both macros
// are undefined again right after the include.
void langPL() {
  #define S(a,b) d[1].add(a,b);
  #define N(a,b,c,d,e,f) \
    {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = f; nouns[1].add(a,n);}
  #include "language-pl.cpp"
  #undef N
  #undef S
  }

// Fills language slot 2 from "language-tr.cpp" (presumably Turkish, judging by
// the names; the file itself is not visible here).
// While the file is included:
//   S(key, text)                         stores a sentence in d[2];
//   N(key, genus, nom, nomp, acc, abl)   stores a noun in nouns[2];
//   N5(key, genus, nom, nomp, form)      stores a noun in nouns[2] whose acc
//                                        and abl are both set to form.
// All three macros are undefined after the include. N5 used to stay defined,
// so it leaked into the language functions that follow, unlike every other
// macro in this file.
void langTR() {
#define S(a,b) d[2].add(a,b);
#define N5(a,b,c,d,e) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = e; nouns[2].add(a,n);}
#define N(a,b,c,d,e,f) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = f; nouns[2].add(a,n);}
#include "language-tr.cpp"
#undef N
#undef N5
#undef S
  }

// Fills language slot 3 from "language-cz.cpp" (presumably Czech, judging by
// the names; the file itself is not visible here).
// While the file is included, S(key, text) stores a sentence in d[3] and
// N(key, genus, nom, nomp, acc, abl) stores a noun in nouns[3]. Both macros
// are undefined again right after the include.
void langCZ() {
#define S(a,b) d[3].add(a,b);
#define N(a,b,c,d,e,f) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = f; nouns[3].add(a,n);}
#include "language-cz.cpp"
#undef N
#undef S
  }

// Fills language slot 4 from "language-ru.cpp" (presumably Russian, judging by
// the names; the file itself is not visible here).
// While the file is included, S(key, text) stores a sentence in d[4] and
// N(key, genus, nom, nomp, acc, abl) stores a noun in nouns[4]. Both macros
// are undefined again right after the include.
void langRU() {
#define S(a,b) d[4].add(a,b);
#define N(a,b,c,d,e,f) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = f; nouns[4].add(a,n);}
#include "language-ru.cpp"
#undef N
#undef S
  }

// Fills language slot 5 from "language-de.cpp" (presumably German, judging by
// the names; the file itself is not visible here).
// While the file is included, S(key, text) stores a sentence in d[5].
// N takes only five arguments here, N(key, genus, nom, nomp, form), and
// stores a noun in nouns[5] whose acc and abl are both set to form.
// Both macros are undefined again right after the include.
void langDE() {
#define S(a,b) d[5].add(a,b);
#define N(a,b,c,d,e) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.acc = e; n.abl = e; nouns[5].add(a,n);}
#include "language-de.cpp"
#undef N
#undef S
  }

// Fills language slot 6 from "language-ptbr.cpp" (presumably Brazilian
// Portuguese, judging by the names; the file itself is not visible here).
// While the file is included, S(key, text) stores a sentence in d[6].
// N takes five arguments here, N(key, genus, nom, nomp, form), and stores a
// noun in nouns[6] with form assigned to abl only; acc is left empty.
// NOTE(review): because acc is empty, main() will have escape() print
// /*MISSING*/ and fall back to the key for that field -- confirm this is
// intended (langDE assigns the fifth argument to both acc and abl).
// Both macros are undefined again right after the include.
void langPT() {
#define S(a,b) d[6].add(a,b);
#define N(a,b,c,d,e) \
  {noun n; n.genus = b; n.nom = c; n.nomp = d; n.abl = e; nouns[6].add(a,n);}
#include "language-ptbr.cpp"
#undef N
#undef S
  }

// Translation counters computed in main(): completeness[0] is the number of
// sentence and noun keys (not counting " [repeat]" keys), completeness[i] is
// how many of them language i provides. Printed as transcompleteness.
int completeness[NUMLAN];

// Entry point of the generator. Loads all translations through the lang*()
// functions and prints the generated source to stdout, in this order:
// missing-translation notes, the NATCHARS character list, the per-language
// completeness counters, size statistics, the chosen hash multiplier, and the
// all_sentences / all_nouns tables.
int main() {

  // Noun keys that receive flags in all_nouns below: +1 for nothe, +2 for plural.
  nothe.insert("R'Lyeh");
  nothe.insert("Camelot");
  plural.insert("Crossroads");
  plural.insert("Crossroads II");
  plural.insert("Crossroads III");
  plural.insert("Elemental Planes");
  plural.insert("Crossroads IV");
  plural.insert("Kraken Depths");
  // Multi-byte characters that are always part of the character list,
  // whether or not any translation uses them.
  allchars.insert("ᵈ");
  allchars.insert("δ");
  allchars.insert("∞");
  allchars.insert("½");
  allchars.insert("²");

  // Fill d[1..6] and nouns[1..6].
  langPL(); langCZ(); langRU();
  langTR(); langDE(); langPT();

  // verify sentences: s becomes the union of the sentence keys of all languages
  set<string> s;
  for(int i=1; i<NUMLAN; i++)
    for(map<string,string>::iterator it = d[i].m.begin(); it != d[i].m.end(); it++)
      s.insert(it->first);

  printf("// DO NOT EDIT -- this file is generated automatically with langen\n\n");

  for(set<string>::iterator x=s.begin(); x != s.end(); x++) {
    // mis collects the languages lacking this key; TR, DE and PT-BR are
    // collected separately in mis1 and do not trigger a note on their own.
    string mis = "", mis1 = "";
    for(int i=1; i<NUMLAN; i++) if(d[i].count(*x) == 0) {
      // each language is identified by its own entry for the key "EN"
      string which = d[i]["EN"];
      if(which != "TR" && which != "DE" && which != "PT-BR")
        mis += which;
      else
        mis1 += which;
      }
    // the note is emitted as a comment, so it does not affect compilation of the output
    if(mis != "" && !isrepeat(*x))
      printf("// #warning Missing [%s/%s]: %s\n", mis.c_str(), mis1.c_str(), escape(*x, "?"));

    // " [repeat]" keys are not counted towards completeness
    if(!isrepeat(*x)) {
      completeness[0]++;
      for(int i=1; i<NUMLAN; i++) if(d[i].count(*x)) completeness[i]++;
      }
    }

  s.clear();

  // verify nouns: same procedure over the union of the noun keys
  for(int i=1; i<NUMLAN; i++)
    for(map<string,noun>::iterator it = nouns[i].m.begin(); it != nouns[i].m.end(); it++)
      s.insert(it->first);

  for(set<string>::iterator x=s.begin(); x != s.end(); x++) {
    string mis = "", mis1 = "";
    for(int i=1; i<NUMLAN; i++) if(nouns[i].count(*x) == 0) {
      string which = d[i]["EN"];
      if(which != "TR" && which != "DE" && which != "PT-BR")
        mis += which;
      else mis1 += which;
      }
    if(mis != "" && !isrepeat(*x))
      printf("// #warning Missing [%s/%s]: %s\n", mis.c_str(), mis1.c_str(), escape(*x, "?"));

    if(!isrepeat(*x)) {
      completeness[0]++;
      for(int i=1; i<NUMLAN; i++) if(nouns[i].count(*x)) completeness[i]++;
      }
    }

#ifdef CHECKALL
  // s still holds the noun keys; the sentence keys are added so that it can be
  // compared with allstr in both directions. Unlike the notes above, these
  // "#warning" lines are printed without a leading "//".
  for(int i=1; i<NUMLAN; i++)
    for(map<string,string>::iterator it = d[i].m.begin(); it != d[i].m.end(); it++)
      s.insert(it->first);

  int ca = sizeof(allstr) / sizeof(char*);
  // entries of allstr that no language file knows
  for(int i=0; i<ca; i++) if(!s.count(allstr[i])) {
    printf("#warning GO %s\n", escape(allstr[i], "?"));
    }

  // known keys that do not appear in allstr
  for(set<string>::iterator x=s.begin(); x != s.end(); x++) {
    bool b = false;
    for(int i=0; i<ca; i++) if(allstr[i] == *x) b = true;
    if(!b) printf("#warning TO %s\n", escape(*x, "?"));
    }
#endif

  // collect the characters of every language into allchars
  // (the third argument is not used by printletters)
  for(int i=1; i<NUMLAN; i++) {
    printletters(d[i], nouns[i], "SOMETHING");
    }

  // c counts, and vchars/javastring collect, the entries of allchars that are
  // at least two bytes long
  int c =0;
  string javastring;
  vector<string> vchars;
//printf("ALL:");
  for(set<string>::iterator it = allchars.begin(); it != allchars.end(); it++) {
//  printf(" \"%s\",", it->c_str());
    if(size(*it) >= 2) { javastring += (*it); vchars.push_back(*it); c++; }
    }
  printf("\n");
  printf("#define NUMEXTRA %d\n", c);
  printf("#define NATCHARS {");
  for(int i=0; i<c; i++) printf("\"%s\",", vchars[i].c_str());
  printf("};\n");
  // no newline here: the "//javastring" comment ends up on the same output line
  printf("const char* natchars[NUMEXTRA] = NATCHARS;");
  printf("//javastring = \"%s\";\n", javastring.c_str());

  printf("\nint transcompleteness[NUMLAN] = {");
  for(int i=0; i<NUMLAN; i++) printf("%d, ", completeness[i]);
  printf("};\n");

  // NOTE(review): the outer set s is not read again after this loop in the
  // visible code (the later "string s" locals shadow it) -- possibly dead code.
  for(int i=1; i<NUMLAN; i++)
    for(map<string,string>::iterator it = d[i].m.begin(); it != d[i].m.end(); it++)
      s.insert(it->first);

  printf("\n//statistics\n");
  // slot 0: every sentence key of language 1 mapped to itself ...
  for(map<string, string>::iterator it = d[1].m.begin(); it != d[1].m.end(); it++)
    d[0][it->first] = it->first;
  // ... and every noun of language 1 with all four forms replaced by the key
  // (genus is copied from language 1)
  for(map<string, noun>::iterator it = nouns[1].m.begin(); it != nouns[1].m.end(); it++) {
    noun n = it->second;
    n.nom = n.nomp = n.acc = n.abl = it->first;
    nouns[0][it->first] = n;
    }

  printf("// total: %5d nouns, %5d sentences\n", int(nouns[1].m.size()), int(d[1].m.size()));

  // per language: total bytes of all noun forms and of all sentence texts
  for(int i=0; i<NUMLAN; i++) {
    int bnouns = 0;
    int dict = 0;

    for(map<string, string>::iterator it = d[i].m.begin(); it != d[i].m.end(); it++)
      dict += it->second.size();
    for(map<string, noun>::iterator it = nouns[i].m.begin(); it != nouns[i].m.end(); it++) {
      noun& n = it->second;
      bnouns += n.nom.size();
      bnouns += n.nomp.size();
      bnouns += n.acc.size();
      bnouns += n.abl.size();
      }

    printf("// %s: %5dB nouns, %5dB sentences\n",
      d[i]["EN"].c_str(), bnouns, dict);
    }

  // The generated tables are built from the keys of language 1 only.
  set<string> allsent;
  for(map<string, string>::iterator it = d[1].m.begin(); it != d[1].m.end(); it++)
    allsent.insert(it->first);

  set<string> allnouns;
  for(map<string, noun>::iterator it = nouns[1].m.begin(); it != nouns[1].m.end(); it++)
    allnouns.insert(it->first);

  map<hashcode, string> ms, mn;

  // Try multipliers from rand() until neither table loses an entry, i.e. until
  // there is no hash collision among the sentence keys nor among the noun keys.
  // No srand() call is visible in this file.
  do {
    hashval = rand();
    printf("// check hash: %x\n", hashval);
    ms = buildHashTable(allsent);
    mn = buildHashTable(allnouns);
    }
  while(size(ms) != size(allsent) || size(mn) != size(allnouns));

  printf("hashcode hashval = 0x%x;\n\n", hashval);

  printf("sentence all_sentences[] = {\n");

  // One entry per sentence key, in hash order: the hash, then the text of each
  // language 1..6. d[i][s] creates an empty entry when language i lacks the
  // key, so escape() prints /*MISSING*/ and falls back to the key itself.
  // " [repeat]" keys are wrapped in #if REPEATED.
  for(map<hashcode,string>::iterator it = ms.begin(); it != ms.end(); it++) {
    string s = it->second;
    if(isrepeat(s)) printf("#if REPEATED\n");
    printf("  {0x%x, { // %s\n", it->first, escape(s, s));
    for(int i=1; i<NUMLAN; i++) printf("   %s,\n", escape(d[i][s], s));
    printf("    }},\n");
    if(isrepeat(s)) printf("#endif\n");
    }
  printf("  };\n\n");

  printf("fullnoun all_nouns[] = {\n");

  // One entry per noun key, in hash order: the hash, the nothe/plural flags,
  // then genus and the four forms for each language 1..6.
  for(map<hashcode,string>::iterator it = mn.begin(); it != mn.end(); it++) {
    string s = it->second;
    if(isrepeat(s)) printf("#if REPEATED\n");
    // NOTE(review): escape() already adds quotes, so the trailing comment is
    // printed with doubled quotes, unlike in all_sentences above.
    printf("  {0x%x, %d, { // \"%s\"\n", it->first,
      (nothe.count(s) ? 1:0) + (plural.count(s) ? 2:0),
      escape(s, s));

    for(int i=1; i<NUMLAN; i++) {
      printf("    {%d", nouns[i][s].genus);
      printf(", %s", escape(nouns[i][s].nom, s));
      printf(", %s", escape(nouns[i][s].nomp, s));
      printf(", %s", escape(nouns[i][s].acc, s));
      printf(", %s},\n", escape(nouns[i][s].abl, s));
      }

    printf("    }},\n");
    if(isrepeat(s)) printf("#endif\n");
    }

  printf("  };\n");

  }