/* $Id$ */

/** @file strgen.cpp Tool to create computer readable (stand-alone) translation files. */

#include "../stdafx.h"
#include "../core/alloc_func.hpp"
#include "../core/endian_func.hpp"
#include "../string_func.h"
#include "../strings_type.h"
#include "strgen.h"
#include "../table/control_codes.h"

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>

#if (!defined(WIN32) && !defined(WIN64)) || defined(__CYGWIN__)
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#endif

#if defined WIN32 || defined __WATCOMC__
#include <direct.h>
#endif /* WIN32 || __WATCOMC__ */

#ifdef __MORPHOS__
#ifdef stderr
#undef stderr
#endif
#define stderr stdout
#endif /* __MORPHOS__ */

/* Compiles a list of strings into a compiled string list */

typedef void (*ParseCmdProc)(char *buf, int value);

struct CmdStruct {
	const char *cmd;
	ParseCmdProc proc;
	long value;
	int8 consumes;
	byte flags;
};

enum {
	C_DONTCOUNT = 1,
	C_CASE      = 2,
};


struct Case {
	int caseidx;
	char *string;
	Case *next;
};

static bool _masterlang;
static bool _translated;
static const char *_file = "(unknown file)";
static int _cur_line;
static int _errors, _warnings, _show_todo;

struct LangString {
	char *name;            // Name of the string
	char *english;         // English text
	char *translated;      // Translated text
	uint16 hash_next;      // next hash entry
	uint16 index;
	int line;              // line of string in source-file
	Case *english_case;    // cases for english
	Case *translated_case; // cases for foreign
};

static LangString *_strings[65536];


#define HASH_SIZE 32767
static uint16 _hash_head[HASH_SIZE];

static byte _put_buf[4096];
static int _put_pos;
static int _next_string_id;

static uint32 _hash;
static char _lang_name[32], _lang_ownname[32], _lang_isocode[16];
static byte _lang_pluralform;
static byte _lang_textdir;
static uint16 _lang_winlangid;
static uint8 _lang_newgrflangid;
#define MAX_NUM_GENDER 8
static char _genders[MAX_NUM_GENDER][16];
static uint _numgenders;

/* contains the name of all cases. */
#define MAX_NUM_CASES 50
static char _cases[MAX_NUM_CASES][16];
static uint _numcases;

/* for each plural value, this is the number of plural forms. */
static const byte _plural_form_counts[] = { 2, 1, 2, 3, 3, 3, 3, 3, 4, 2, 3 };

static const char *_cur_ident;

struct CmdPair {
	const CmdStruct *a;
	const char *v;
};

struct ParsedCommandStruct {
	uint np;
	CmdPair pairs[32];
	const CmdStruct *cmd[32]; // ordered by param #
};

/* Used when generating some advanced commands. */
static ParsedCommandStruct _cur_pcs;
static int _cur_argidx;

static uint HashStr(const char *s)
{
	uint hash = 0;
	for (; *s != '\0'; s++) hash = ROL(hash, 3) ^ *s;
	return hash % HASH_SIZE;
}

static void HashAdd(const char *s, LangString *ls)
{
	uint hash = HashStr(s);
	ls->hash_next = _hash_head[hash];
	_hash_head[hash] = ls->index + 1;
}

static LangString *HashFind(const char *s)
{
	int idx = _hash_head[HashStr(s)];

	while (--idx >= 0) {
		LangString *ls = _strings[idx];

		if (strcmp(ls->name, s) == 0) return ls;
		idx = ls->hash_next;
	}
	return NULL;
}

#ifdef _MSC_VER
# define LINE_NUM_FMT "(%d)"
#else
# define LINE_NUM_FMT ":%d"
#endif

static void CDECL strgen_warning(const char *s, ...)
{
	char buf[1024];
	va_list va;
	va_start(va, s);
	vsnprintf(buf, lengthof(buf), s, va);
	va_end(va);
	fprintf(stderr, "%s" LINE_NUM_FMT ": warning: %s\n", _file, _cur_line, buf);
	_warnings++;
}

static void CDECL strgen_error(const char *s, ...)
{
	char buf[1024];
	va_list va;
	va_start(va, s);
	vsnprintf(buf, lengthof(buf), s, va);
	va_end(va);
	fprintf(stderr, "%s" LINE_NUM_FMT ": error: %s\n", _file, _cur_line, buf);
	_errors++;
}

void NORETURN CDECL error(const char *s, ...)
{
	char buf[1024];
	va_list va;
	va_start(va, s);
	vsnprintf(buf, lengthof(buf), s, va);
	va_end(va);
	fprintf(stderr, "%s" LINE_NUM_FMT ": FATAL: %s\n", _file, _cur_line, buf);
	exit(1);
}

static void PutByte(byte c)
{
	if (_put_pos == lengthof(_put_buf)) error("Put buffer too small");
	_put_buf[_put_pos++] = c;
}


static void PutUtf8(uint32 value)
{
	if (value < 0x80) {
		PutByte(value);
	} else if (value < 0x800) {
		PutByte(0xC0 + GB(value,  6, 5));
		PutByte(0x80 + GB(value,  0, 6));
	} else if (value < 0x10000) {
		PutByte(0xE0 + GB(value, 12, 4));
		PutByte(0x80 + GB(value,  6, 6));
		PutByte(0x80 + GB(value,  0, 6));
	} else if (value < 0x110000) {
		PutByte(0xF0 + GB(value, 18, 3));
		PutByte(0x80 + GB(value, 12, 6));
		PutByte(0x80 + GB(value,  6, 6));
		PutByte(0x80 + GB(value,  0, 6));
	} else {
		strgen_warning("Invalid unicode value U+0x%X", value);
	}
}


size_t Utf8Validate(const char *s)
{
	uint32 c;

	if (!HasBit(s[0], 7)) {
		/* 1 byte */
		return 1;
	} else if (GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
		/* 2 bytes */
		c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
		if (c >= 0x80) return 2;
	} else if (GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
		/* 3 bytes */
		c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
		if (c >= 0x800) return 3;
	} else if (GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
		/* 4 bytes */
		c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
		if (c >= 0x10000 && c <= 0x10FFFF) return 4;
	}

	return 0;
}


static void EmitSingleChar(char *buf, int value)
{
	if (*buf != '\0') strgen_warning("Ignoring trailing letters in command");
	PutUtf8(value);
}


static void EmitSetX(char *buf, int value)
{
	char *err;
	int x = strtol(buf, &err, 0);
	if (*err != 0) error("SetX param invalid");
	PutUtf8(SCC_SETX);
	PutByte((byte)x);
}


static void EmitSetXY(char *buf, int value)
{
	char *err;
	int x;
	int y;

	x = strtol(buf, &err, 0);
	if (*err != ' ') error("SetXY param invalid");
	y = strtol(err + 1, &err, 0);
	if (*err != 0) error("SetXY param invalid");

	PutUtf8(SCC_SETXY);
	PutByte((byte)x);
	PutByte((byte)y);
}

/* The plural specifier looks like
 * {NUM} {PLURAL -1 passenger passengers} then it picks either passenger/passengers depending on the count in NUM */

/* This is encoded like
 *  CommandByte <ARG#> <NUM> {Length of each string} {each string} */

bool ParseRelNum(char **buf, int *value)
{
	const char *s = *buf;
	char *end;
	bool rel = false;
	int v;

	while (*s == ' ' || *s == '\t') s++;
	if (*s == '+') {
		rel = true;
		s++;
	}
	v = strtol(s, &end, 0);
	if (end == s) return false;
	if (rel || v < 0) {
		*value += v;
	} else {
		*value = v;
	}
	*buf = end;
	return true;
}

/* Parse out the next word, or NULL */
char *ParseWord(char **buf)
{
	char *s = *buf, *r;

	while (*s == ' ' || *s == '\t') s++;
	if (*s == '\0') return NULL;

	if (*s == '"') {
		r = ++s;
		/* parse until next " or NUL */
		for (;;) {
			if (*s == '\0') break;
			if (*s == '"') {
				*s++ = '\0';
				break;
			}
			s++;
		}
	} else {
		/* proceed until whitespace or NUL */
		r = s;
		for (;;) {
			if (*s == '\0') break;
			if (*s == ' ' || *s == '\t') {
				*s++ = '\0';
				break;
			}
			s++;
		}
	}
	*buf = s;
	return r;
}

/* Forward declaration */
static int TranslateArgumentIdx(int arg);

static void EmitWordList(const char * const *words, uint nw)
{
	uint i;
	uint j;

	PutByte(nw);
	for (i = 0; i < nw; i++) PutByte(strlen(words[i]));
	for (i = 0; i < nw; i++) {
		for (j = 0; words[i][j] != '\0'; j++) PutByte(words[i][j]);
	}
}

static void EmitPlural(char *buf, int value)
{
	int argidx = _cur_argidx;
	const char *words[5];
	int nw = 0;

	/* Parse out the number, if one exists. Otherwise default to prev arg. */
	if (!ParseRelNum(&buf, &argidx)) argidx--;

	/* Parse each string */
	for (nw = 0; nw < 5; nw++) {
		words[nw] = ParseWord(&buf);
		if (words[nw] == NULL) break;
	}

	if (nw == 0)
		error("%s: No plural words", _cur_ident);

	if (_plural_form_counts[_lang_pluralform] != nw) {
		if (_translated) {
			error("%s: Invalid number of plural forms. Expecting %d, found %d.", _cur_ident,
				_plural_form_counts[_lang_pluralform], nw);
		} else {
			if ((_show_todo & 2) != 0) strgen_warning("'%s' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident);
			if (nw > _plural_form_counts[_lang_pluralform]) {
				nw = _plural_form_counts[_lang_pluralform];
			} else {
				for (; nw < _plural_form_counts[_lang_pluralform]; nw++) {
					words[nw] = words[nw - 1];
				}
			}
		}
	}

	PutUtf8(SCC_PLURAL_LIST);
	PutByte(TranslateArgumentIdx(argidx));
	EmitWordList(words, nw);
}


static void EmitGender(char *buf, int value)
{
	int argidx = _cur_argidx;
	uint nw;

	if (buf[0] == '=') {
		buf++;

		/* This is a {G=DER} command */
		for (nw = 0; ; nw++) {
			if (nw >= 8) error("G argument '%s' invalid", buf);
			if (strcmp(buf, _genders[nw]) == 0) break;
		}
		/* now nw contains the gender index */
		PutUtf8(SCC_GENDER_INDEX);
		PutByte(nw);
	} else {
		const char *words[8];

		/* This is a {G 0 foo bar two} command.
		 * If no relative number exists, default to +0 */
		if (!ParseRelNum(&buf, &argidx)) {}

		for (nw = 0; nw < 8; nw++) {
			words[nw] = ParseWord(&buf);
			if (words[nw] == NULL) break;
		}
		if (nw != _numgenders) error("Bad # of arguments for gender command");
		PutUtf8(SCC_GENDER_LIST);
		PutByte(TranslateArgumentIdx(argidx));
		EmitWordList(words, nw);
	}
}


static const CmdStruct _cmd_structs[] = {
	/* Update position */
	{"SETX",  EmitSetX,  SCC_SETX,  0, 0},
	{"SETXY", EmitSetXY, SCC_SETXY, 0, 0},

	/* Font size */
	{"TINYFONT", EmitSingleChar, SCC_TINYFONT, 0, 0},
	{"BIGFONT",  EmitSingleChar, SCC_BIGFONT,  0, 0},

	/* Colors */
	{"BLUE",    EmitSingleChar, SCC_BLUE,    0, 0},
	{"SILVER",  EmitSingleChar, SCC_SILVER,  0, 0},
	{"GOLD",    EmitSingleChar, SCC_GOLD,    0, 0},
	{"RED",     EmitSingleChar, SCC_RED,     0, 0},
	{"PURPLE",  EmitSingleChar, SCC_PURPLE,  0, 0},
	{"LTBROWN", EmitSingleChar, SCC_LTBROWN, 0, 0},
	{"ORANGE",  EmitSingleChar, SCC_ORANGE,  0, 0},
	{"GREEN",   EmitSingleChar, SCC_GREEN,   0, 0},
	{"YELLOW",  EmitSingleChar, SCC_YELLOW,  0, 0},
	{"DKGREEN", EmitSingleChar, SCC_DKGREEN, 0, 0},
	{"CREAM",   EmitSingleChar, SCC_CREAM,   0, 0},
	{"BROWN",   EmitSingleChar, SCC_BROWN,   0, 0},
	{"WHITE",   EmitSingleChar, SCC_WHITE,   0, 0},
	{"LTBLUE",  EmitSingleChar, SCC_LTBLUE,  0, 0},
	{"GRAY",    EmitSingleChar, SCC_GRAY,    0, 0},
	{"DKBLUE",  EmitSingleChar, SCC_DKBLUE,  0, 0},
	{"BLACK",   EmitSingleChar, SCC_BLACK,   0, 0},

	{"CURRCOMPACT",   EmitSingleChar, SCC_CURRENCY_COMPACT,    1, 0}, // compact currency
	{"REV",           EmitSingleChar, SCC_REVISION,            0, 0}, // openttd revision string
	{"SHORTCARGO",    EmitSingleChar, SCC_CARGO_SHORT,         2, 0}, // short cargo description, only ### tons, or ### litres

	{"STRING1", EmitSingleChar, SCC_STRING1, 2, C_CASE}, // included string that consumes the string id and ONE argument
	{"STRING2", EmitSingleChar, SCC_STRING2, 3, C_CASE}, // included string that consumes the string id and TWO arguments
	{"STRING3", EmitSingleChar, SCC_STRING3, 4, C_CASE}, // included string that consumes the string id and THREE arguments
	{"STRING4", EmitSingleChar, SCC_STRING4, 5, C_CASE}, // included string that consumes the string id and FOUR arguments
	{"STRING5", EmitSingleChar, SCC_STRING5, 6, C_CASE}, // included string that consumes the string id and FIVE arguments

	{"STATIONFEATURES", EmitSingleChar, SCC_STATION_FEATURES, 1, 0}, // station features string, icons of the features
	{"INDUSTRY",        EmitSingleChar, SCC_INDUSTRY_NAME,    1, 0}, // industry, takes an industry #
	{"CARGO",           EmitSingleChar, SCC_CARGO,            2, 0},
	{"POWER",           EmitSingleChar, SCC_POWER,            1, 0},
	{"VOLUME",          EmitSingleChar, SCC_VOLUME,           1, 0},
	{"VOLUME_S",        EmitSingleChar, SCC_VOLUME_SHORT,     1, 0},
	{"WEIGHT",          EmitSingleChar, SCC_WEIGHT,           1, 0},
	{"WEIGHT_S",        EmitSingleChar, SCC_WEIGHT_SHORT,     1, 0},
	{"FORCE",           EmitSingleChar, SCC_FORCE,            1, 0},
	{"VELOCITY",        EmitSingleChar, SCC_VELOCITY,         1, 0},

	{"P", EmitPlural, 0, 0, C_DONTCOUNT}, // plural specifier
	{"G", EmitGender, 0, 0, C_DONTCOUNT}, // gender specifier

	{"DATE_TINY",  EmitSingleChar, SCC_DATE_TINY, 1, 0},
	{"DATE_SHORT", EmitSingleChar, SCC_DATE_SHORT, 1, 0},
	{"DATE_LONG",  EmitSingleChar, SCC_DATE_LONG, 1, 0},
	{"DATE_ISO",   EmitSingleChar, SCC_DATE_ISO, 1, 0},

	{"SKIP", EmitSingleChar, SCC_SKIP, 1, 0},

	{"STRING", EmitSingleChar, SCC_STRING, 1, C_CASE},
	{"RAW_STRING", EmitSingleChar, SCC_RAW_STRING_POINTER, 1, 0},

	/* Numbers */
	{"COMMA", EmitSingleChar, SCC_COMMA, 1, 0}, // Number with comma
	{"NUM",   EmitSingleChar, SCC_NUM,   1, 0}, // Signed number
	{"BYTES", EmitSingleChar, SCC_BYTES, 1, 0}, // Unsigned number with "bytes", i.e. "1.02 MiB or 123 KiB"

	{"CURRENCY",   EmitSingleChar, SCC_CURRENCY,    1, 0},

	{"WAYPOINT", EmitSingleChar, SCC_WAYPOINT_NAME, 1, 0}, // waypoint name
	{"STATION",  EmitSingleChar, SCC_STATION_NAME,  1, 0},
	{"TOWN",     EmitSingleChar, SCC_TOWN_NAME,     1, 0},
	{"GROUP",    EmitSingleChar, SCC_GROUP_NAME,    1, 0},
	{"SIGN",     EmitSingleChar, SCC_SIGN_NAME,     1, 0},
	{"ENGINE",   EmitSingleChar, SCC_ENGINE_NAME,   1, 0},
	{"VEHICLE",  EmitSingleChar, SCC_VEHICLE_NAME,  1, 0},
	{"COMPANY",  EmitSingleChar, SCC_COMPANY_NAME,  1, 0},
	{"COMPANYNUM", EmitSingleChar, SCC_COMPANY_NUM, 1, 0},
	{"PRESIDENTNAME", EmitSingleChar, SCC_PRESIDENT_NAME, 1, 0},

	// 0x9D is used for the pseudo command SETCASE
	// 0x9E is used for case switching

	{"",               EmitSingleChar, '\n',               0, C_DONTCOUNT},
	{"{",              EmitSingleChar, '{',                0, C_DONTCOUNT},
	{"UPARROW",        EmitSingleChar, SCC_UPARROW,        0, 0},
	{"SMALLUPARROW",   EmitSingleChar, SCC_SMALLUPARROW,   0, 0},
	{"SMALLDOWNARROW", EmitSingleChar, SCC_SMALLDOWNARROW, 0, 0},
	{"TRAIN",          EmitSingleChar, SCC_TRAIN,          0, 0},
	{"LORRY",          EmitSingleChar, SCC_LORRY,          0, 0},
	{"BUS",            EmitSingleChar, SCC_BUS,            0, 0},
	{"PLANE",          EmitSingleChar, SCC_PLANE,          0, 0},
	{"SHIP",           EmitSingleChar, SCC_SHIP,           0, 0},
	{"NBSP",           EmitSingleChar, 0xA0,               0, C_DONTCOUNT},
	{"CENT",           EmitSingleChar, 0xA2,               0, C_DONTCOUNT},
	{"POUNDSIGN",      EmitSingleChar, 0xA3,               0, C_DONTCOUNT},
	{"EURO",           EmitSingleChar, 0x20AC,             0, C_DONTCOUNT},
	{"YENSIGN",        EmitSingleChar, 0xA5,               0, C_DONTCOUNT},
	{"COPYRIGHT",      EmitSingleChar, 0xA9,               0, C_DONTCOUNT},
	{"DOWNARROW",      EmitSingleChar, SCC_DOWNARROW,      0, C_DONTCOUNT},
	{"CHECKMARK",      EmitSingleChar, SCC_CHECKMARK,      0, C_DONTCOUNT},
	{"CROSS",          EmitSingleChar, SCC_CROSS,          0, C_DONTCOUNT},
	{"REGISTERED",     EmitSingleChar, 0xAE,               0, C_DONTCOUNT},
	{"RIGHTARROW",     EmitSingleChar, SCC_RIGHTARROW,     0, C_DONTCOUNT},
	{"SMALLLEFTARROW", EmitSingleChar, SCC_LESSTHAN,       0, C_DONTCOUNT},
	{"SMALLRIGHTARROW",EmitSingleChar, SCC_GREATERTHAN,    0, C_DONTCOUNT},

	/* The following are directional formatting codes used to get the RTL strings right:
	 * http://www.unicode.org/unicode/reports/tr9/#Directional_Formatting_Codes */
	{"LRM",            EmitSingleChar, 0x200E,             0, C_DONTCOUNT},
	{"RLM",            EmitSingleChar, 0x200F,             0, C_DONTCOUNT},
	{"LRE",            EmitSingleChar, 0x202A,             0, C_DONTCOUNT},
	{"RLE",            EmitSingleChar, 0x202B,             0, C_DONTCOUNT},
	{"LRO",            EmitSingleChar, 0x202D,             0, C_DONTCOUNT},
	{"RLO",            EmitSingleChar, 0x202E,             0, C_DONTCOUNT},
	{"PDF",            EmitSingleChar, 0x202C,             0, C_DONTCOUNT},
};


static const CmdStruct *FindCmd(const char *s, int len)
{
	const CmdStruct *cs;

	for (cs = _cmd_structs; cs != endof(_cmd_structs); cs++) {
		if (strncmp(cs->cmd, s, len) == 0 && cs->cmd[len] == '\0') return cs;
	}
	return NULL;
}

static uint ResolveCaseName(const char *str, uint len)
{
	uint i;

	for (i = 0; i < MAX_NUM_CASES; i++) {
		if (memcmp(_cases[i], str, len) == 0 && _cases[i][len] == 0) return i + 1;
	}
	error("Invalid case-name '%s'", str);
}


/* returns NULL on eof
 * else returns command struct */
static const CmdStruct *ParseCommandString(const char **str, char *param, int *argno, int *casei)
{
	const char *s = *str, *start;
	const CmdStruct *cmd;
	byte c;

	*argno = -1;
	*casei = -1;

	/* Scan to the next command, exit if there's no next command. */
	for (; *s != '{'; s++) {
		if (*s == '\0') return NULL;
	}
	s++; // Skip past the {

	if (*s >= '0' && *s <= '9') {
		char *end;

		*argno = strtoul(s, &end, 0);
		if (*end != ':') error("missing arg #");
		s = end + 1;
	}

	/* parse command name */
	start = s;
	do {
		c = *s++;
	} while (c != '}' && c != ' ' && c != '=' && c != '.' && c != 0);

	cmd = FindCmd(start, s - start - 1);
	if (cmd == NULL) {
		strgen_error("Undefined command '%.*s'", s - start - 1, start);
		return NULL;
	}

	if (c == '.') {
		const char *casep = s;

		if (!(cmd->flags & C_CASE))
			error("Command '%s' can't have a case", cmd->cmd);

		do c = *s++; while (c != '}' && c != ' ' && c != '\0');
		*casei = ResolveCaseName(casep, s - casep - 1);
	}

	if (c == '\0') {
		strgen_error("Missing } from command '%s'", start);
		return NULL;
	}


	if (c != '}') {
		if (c == '=') s--;
		/* copy params */
		start = s;
		for (;;) {
			c = *s++;
			if (c == '}') break;
			if (c == '\0') {
				strgen_error("Missing } from command '%s'", start);
				return NULL;
			}
			if (s - start == 250) error("param command too long");
			*param++ = c;
		}
	}
	*param = '\0';

	*str = s;

	return cmd;
}


static void HandlePragma(char *str)
{
	if (!memcmp(str, "id ", 3)) {
		_next_string_id = strtoul(str + 3, NULL, 0);
	} else if (!memcmp(str, "name ", 5)) {
		strecpy(_lang_name, str + 5, lastof(_lang_name));
	} else if (!memcmp(str, "ownname ", 8)) {
		strecpy(_lang_ownname, str + 8, lastof(_lang_ownname));
	} else if (!memcmp(str, "isocode ", 8)) {
		strecpy(_lang_isocode, str + 8, lastof(_lang_isocode));
	} else if (!memcmp(str, "plural ", 7)) {
		_lang_pluralform = atoi(str + 7);
		if (_lang_pluralform >= lengthof(_plural_form_counts))
			error("Invalid pluralform %d", _lang_pluralform);
	} else if (!memcmp(str, "textdir ", 8)) {
		if (!memcmp(str + 8, "ltr", 3)) {
			_lang_textdir = TD_LTR;
		} else if (!memcmp(str + 8, "rtl", 3)) {
			_lang_textdir = TD_RTL;
		} else {
			error("Invalid textdir %s", str + 8);
		}
	} else if (!memcmp(str, "winlangid ", 10)) {
		const char *buf = str + 10;
		long langid = strtol(buf, NULL, 16);
		if (langid > UINT16_MAX || langid < 0) {
			error("Invalid winlangid %s", buf);
		}
		_lang_winlangid = (uint16)langid;
	} else if (!memcmp(str, "grflangid ", 10)) {
		const char *buf = str + 10;
		long langid = strtol(buf, NULL, 16);
		if (langid >= 0x7F || langid < 0) {
			error("Invalid grflangid %s", buf);
		}
		_lang_newgrflangid = (uint8)langid;
	} else if (!memcmp(str, "gender ", 7)) {
		char *buf = str + 7;

		for (;;) {
			const char *s = ParseWord(&buf);

			if (s == NULL) break;
			if (_numgenders >= MAX_NUM_GENDER) error("Too many genders, max %d", MAX_NUM_GENDER);
			strecpy(_genders[_numgenders], s, lastof(_genders[_numgenders]));
			_numgenders++;
		}
	} else if (!memcmp(str, "case ", 5)) {
		char *buf = str + 5;

		for (;;) {
			const char *s = ParseWord(&buf);

			if (s == NULL) break;
			if (_numcases >= MAX_NUM_CASES) error("Too many cases, max %d", MAX_NUM_CASES);
			strecpy(_cases[_numcases], s, lastof(_cases[_numcases]));
			_numcases++;
		}
	} else {
		error("unknown pragma '%s'", str);
	}
}

static void ExtractCommandString(ParsedCommandStruct *p, const char *s, bool warnings)
{
	char param[100];
	int argno;
	int argidx = 0;
	int casei;

	memset(p, 0, sizeof(*p));

	for (;;) {
		/* read until next command from a. */
		const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei);

		if (ar == NULL) break;

		/* Sanity checking */
		if (argno != -1 && ar->consumes == 0) error("Non consumer param can't have a paramindex");

		if (ar->consumes) {
			if (argno != -1) argidx = argno;
			if (argidx < 0 || (uint)argidx >= lengthof(p->cmd)) error("invalid param idx %d", argidx);
			if (p->cmd[argidx] != NULL && p->cmd[argidx] != ar) error("duplicate param idx %d", argidx);

			p->cmd[argidx++] = ar;
		} else if (!(ar->flags & C_DONTCOUNT)) { // Ignore some of them
			if (p->np >= lengthof(p->pairs)) error("too many commands in string, max %d", lengthof(p->pairs));
			p->pairs[p->np].a = ar;
			p->pairs[p->np].v = param[0] != '\0' ? strdup(param) : "";
			p->np++;
		}
	}
}


static const CmdStruct *TranslateCmdForCompare(const CmdStruct *a)
{
	if (a == NULL) return NULL;

	if (strcmp(a->cmd, "STRING1") == 0 ||
			strcmp(a->cmd, "STRING2") == 0 ||
			strcmp(a->cmd, "STRING3") == 0 ||
			strcmp(a->cmd, "STRING4") == 0 ||
			strcmp(a->cmd, "STRING5") == 0 ||
			strcmp(a->cmd, "RAW_STRING") == 0){
		return FindCmd("STRING", 6);
	}

	if (strcmp(a->cmd, "SKIP") == 0) return NULL;

	return a;
}


static bool CheckCommandsMatch(char *a, char *b, const char *name)
{
	ParsedCommandStruct templ;
	ParsedCommandStruct lang;
	uint i, j;
	bool result = true;

	ExtractCommandString(&templ, b, true);
	ExtractCommandString(&lang, a, true);

	/* For each string in templ, see if we find it in lang */
	if (templ.np != lang.np) {
		strgen_warning("%s: template string and language string have a different # of commands", name);
		result = false;
	}

	for (i = 0; i < templ.np; i++) {
		/* see if we find it in lang, and zero it out */
		bool found = false;
		for (j = 0; j < lang.np; j++) {
			if (templ.pairs[i].a == lang.pairs[j].a &&
					strcmp(templ.pairs[i].v, lang.pairs[j].v) == 0) {
				/* it was found in both. zero it out from lang so we don't find it again */
				lang.pairs[j].a = NULL;
				found = true;
				break;
			}
		}

		if (!found) {
			strgen_warning("%s: command '%s' exists in template file but not in language file", name, templ.pairs[i].a->cmd);
			result = false;
		}
	}

	/* if we reach here, all non consumer commands match up.
	 * Check if the non consumer commands match up also. */
	for (i = 0; i < lengthof(templ.cmd); i++) {
		if (TranslateCmdForCompare(templ.cmd[i]) != TranslateCmdForCompare(lang.cmd[i])) {
			strgen_warning("%s: Param idx #%d '%s' doesn't match with template command '%s'", name, i,
				lang.cmd[i]  == NULL ? "<empty>" : lang.cmd[i]->cmd,
				templ.cmd[i] == NULL ? "<empty>" : templ.cmd[i]->cmd);
			result = false;
		}
	}

	return result;
}

static void HandleString(char *str, bool master)
{
	char *s, *t;
	LangString *ent;
	char *casep;

	if (*str == '#') {
		if (str[1] == '#' && str[2] != '#') HandlePragma(str + 2);
		return;
	}

	/* Ignore comments & blank lines */
	if (*str == ';' || *str == ' ' || *str == '\0') return;

	s = strchr(str, ':');
	if (s == NULL) {
		strgen_error("Line has no ':' delimiter");
		return;
	}

	/* Trim spaces.
	 * After this str points to the command name, and s points to the command contents */
	for (t = s; t > str && (t[-1] == ' ' || t[-1] == '\t'); t--);
	*t = 0;
	s++;

	/* Check string is valid UTF-8 */
	{
		const char *tmp;
		for (tmp = s; *tmp != '\0';) {
			size_t len = Utf8Validate(tmp);
			if (len == 0) error("Invalid UTF-8 sequence in '%s'", s);
			tmp += len;
		}
	}

	/* Check if the string has a case..
	 * The syntax for cases is IDENTNAME.case */
	casep = strchr(str, '.');
	if (casep) *casep++ = 0;

	/* Check if this string already exists.. */
	ent = HashFind(str);

	if (master) {
		if (ent != NULL && casep == NULL) {
			strgen_error("String name '%s' is used multiple times", str);
			return;
		}

		if (ent == NULL && casep != NULL) {
			strgen_error("Base string name '%s' doesn't exist yet. Define it before defining a case.", str);
			return;
		}

		if (ent == NULL) {
			if (_strings[_next_string_id]) {
				strgen_error("String ID 0x%X for '%s' already in use by '%s'", ent, str, _strings[_next_string_id]->name);
				return;
			}

			/* Allocate a new LangString */
			ent = CallocT<LangString>(1);
			_strings[_next_string_id] = ent;
			ent->index = _next_string_id++;
			ent->name = strdup(str);
			ent->line = _cur_line;

			HashAdd(str, ent);
		}

		if (casep != NULL) {
			Case *c = MallocT<Case>(1);

			c->caseidx = ResolveCaseName(casep, strlen(casep));
			c->string = strdup(s);
			c->next = ent->english_case;
			ent->english_case = c;
		} else {
			ent->english = strdup(s);
		}

	} else {
		if (ent == NULL) {
			strgen_warning("String name '%s' does not exist in master file", str);
			return;
		}

		if (ent->translated && casep == NULL) {
			strgen_error("String name '%s' is used multiple times", str);
			return;
		}

		if (s[0] == ':' && s[1] == '\0' && casep == NULL) {
			/* Special syntax :: means we should just inherit the master string */
			ent->translated = strdup(ent->english);
		} else {
			/* make sure that the commands match */
			if (!CheckCommandsMatch(s, ent->english, str)) return;

			if (casep != NULL) {
				Case *c = MallocT<Case>(1);

				c->caseidx = ResolveCaseName(casep, strlen(casep));
				c->string = strdup(s);
				c->next = ent->translated_case;
				ent->translated_case = c;
			} else {
				ent->translated = strdup(s);
			}
		}
	}
}


static void rstrip(char *buf)
{
	int i = strlen(buf);
	while (i > 0 && (buf[i - 1] == '\r' || buf[i - 1] == '\n' || buf[i - 1] == ' ')) i--;
	buf[i] = '\0';
}


static void ParseFile(const char *file, bool english)
{
	FILE *in;
	char buf[2048];

	_file = file;

	/* For each new file we parse, reset the genders, and language codes */
	_numgenders = 0;
	_lang_name[0] = _lang_ownname[0] = _lang_isocode[0] = '\0';
	_lang_textdir = TD_LTR;
	_lang_winlangid = 0x0000; // neutral language code
	_lang_newgrflangid = 0; // standard english
	/* TODO:!! We can't reset the cases. In case the translated strings
	 * derive some strings from english.... */

	in = fopen(file, "r");
	if (in == NULL) error("Cannot open file");
	_cur_line = 1;
	while (fgets(buf, sizeof(buf), in) != NULL) {
		rstrip(buf);
		HandleString(buf, english);
		_cur_line++;
	}
	fclose(in);

	if (StrEmpty(_lang_name) || StrEmpty(_lang_ownname) || StrEmpty(_lang_isocode)) {
		error("Language must include ##name, ##ownname and ##isocode");
	}
}


static uint32 MyHashStr(uint32 hash, const char *s)
{
	for (; *s != '\0'; s++) {
		hash = ROL(hash, 3) ^ *s;
		hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
	}
	return hash;
}


/* make a hash of the file to get a unique "version number" */
static void MakeHashOfStrings()
{
	uint32 hash = 0;
	uint i;

	for (i = 0; i != lengthof(_strings); i++) {
		const LangString *ls = _strings[i];

		if (ls != NULL) {
			const CmdStruct *cs;
			const char *s;
			char buf[256];
			int argno;
			int casei;

			s = ls->name;
			hash ^= i * 0x717239;
			hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
			hash = MyHashStr(hash, s + 1);

			s = ls->english;
			while ((cs = ParseCommandString(&s, buf, &argno, &casei)) != NULL) {
				if (cs->flags & C_DONTCOUNT) continue;

				hash ^= (cs - _cmd_structs) * 0x1234567;
				hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1);
			}
		}
	}
	_hash = hash;
}


static uint CountInUse(uint grp)
{
	int i;

	for (i = 0x800; --i >= 0;) if (_strings[(grp << 11) + i] != NULL) break;
	return i + 1;
}


bool CompareFiles(const char *n1, const char *n2)
{
	FILE *f1, *f2;
	char b1[4096];
	char b2[4096];
	size_t l1, l2;

	f2 = fopen(n2, "rb");
	if (f2 == NULL) return false;

	f1 = fopen(n1, "rb");
	if (f1 == NULL) error("can't open %s", n1);

	do {
		l1 = fread(b1, 1, sizeof(b1), f1);
		l2 = fread(b2, 1, sizeof(b2), f2);

		if (l1 != l2 || memcmp(b1, b2, l1)) {
			fclose(f2);
			fclose(f1);
			return false;
		}
	} while (l1);

	fclose(f2);
	fclose(f1);
	return true;
}


static void WriteStringsH(const char *filename)
{
	FILE *out;
	int i;
	int next = -1;

	out = fopen("tmp.xxx", "w");
	if (out == NULL) error("can't open tmp.xxx");

	fprintf(out, "/* This file is automatically generated. Do not modify */\n\n");
	fprintf(out, "#ifndef TABLE_STRINGS_H\n");
	fprintf(out, "#define TABLE_STRINGS_H\n");

	for (i = 0; i != lengthof(_strings); i++) {
		if (_strings[i] != NULL) {
			if (next != i) fprintf(out, "\n");
			fprintf(out, "static const StringID %s = 0x%X;\n", _strings[i]->name, i);
			next = i + 1;
		}
	}

	fprintf(out, "\nstatic const StringID STR_LAST_STRINGID = 0x%X;\n", next - 1);

	fprintf(out,
		"\nenum {\n"
		"\tLANGUAGE_PACK_IDENT = 0x474E414C, // Big Endian value for 'LANG' (LE is 0x 4C 41 4E 47)\n"
		"\tLANGUAGE_PACK_VERSION = 0x%X,\n"
		"};\n", (uint)_hash
	);

	fprintf(out, "\n#endif /* TABLE_STRINGS_H */\n");

	fclose(out);

	if (CompareFiles("tmp.xxx", filename)) {
		/* files are equal. tmp.xxx is not needed */
		unlink("tmp.xxx");
	} else {
		/* else rename tmp.xxx into filename */
#if defined(WIN32) || defined(WIN64)
		unlink(filename);
#endif
		if (rename("tmp.xxx", filename) == -1) error("rename() failed");
	}
}

static int TranslateArgumentIdx(int argidx)
{
	int i, sum;

	if (argidx < 0 || (uint)argidx >= lengthof(_cur_pcs.cmd))
		error("invalid argidx %d", argidx);

	for (i = sum = 0; i < argidx; i++) {
		const CmdStruct *cs = _cur_pcs.cmd[i];
		sum += (cs != NULL) ? cs->consumes : 1;
	}

	return sum;
}

static void PutArgidxCommand()
{
	PutUtf8(SCC_ARG_INDEX);
	PutByte(TranslateArgumentIdx(_cur_argidx));
}


static void PutCommandString(const char *str)
{
	const CmdStruct *cs;
	char param[256];
	int argno;
	int casei;

	_cur_argidx = 0;

	while (*str != '\0') {
		/* Process characters as they are until we encounter a { */
		if (*str != '{') {
			PutByte(*str++);
			continue;
		}
		cs = ParseCommandString(&str, param, &argno, &casei);
		if (cs == NULL) break;

		if (casei != -1) {
			PutUtf8(SCC_SETCASE); // {SETCASE}
			PutByte(casei);
		}

		/* For params that consume values, we need to handle the argindex properly */
		if (cs->consumes > 0) {
			/* Check if we need to output a move-param command */
			if (argno != -1 && argno != _cur_argidx) {
				_cur_argidx = argno;
				PutArgidxCommand();
			}

			/* Output the one from the master string... it's always accurate. */
			cs = _cur_pcs.cmd[_cur_argidx++];
			if (cs == NULL) {
				error("%s: No argument exists at position %d", _cur_ident, _cur_argidx - 1);
			}
		}

		cs->proc(param, cs->value);
	}
}

static void WriteLength(FILE *f, uint length)
{
	if (length < 0xC0) {
		fputc(length, f);
	} else if (length < 0x4000) {
		fputc((length >> 8) | 0xC0, f);
		fputc(length & 0xFF, f);
	} else {
		error("string too long");
	}
}


static void WriteLangfile(const char *filename)
{
	FILE *f;
	uint in_use[32];
	LanguagePackHeader hdr;
	uint i;
	uint j;

	f = fopen(filename, "wb");
	if (f == NULL) error("can't open %s", filename);

	memset(&hdr, 0, sizeof(hdr));
	for (i = 0; i != 32; i++) {
		uint n = CountInUse(i);

		in_use[i] = n;
		hdr.offsets[i] = TO_LE16(n);
	}

	/* see line 655: fprintf(..."\tLANGUAGE_PACK_IDENT = 0x474E414C,...) */
	hdr.ident = TO_LE32(0x474E414C); // Big Endian value for 'LANG'
	hdr.version = TO_LE32(_hash);
	hdr.plural_form = _lang_pluralform;
	hdr.text_dir = _lang_textdir;
	hdr.winlangid = TO_LE16(_lang_winlangid);
	hdr.newgrflangid = _lang_newgrflangid;
	strecpy(hdr.name, _lang_name, lastof(hdr.name));
	strecpy(hdr.own_name, _lang_ownname, lastof(hdr.own_name));
	strecpy(hdr.isocode, _lang_isocode, lastof(hdr.isocode));

	fwrite(&hdr, sizeof(hdr), 1, f);

	for (i = 0; i != 32; i++) {
		for (j = 0; j != in_use[i]; j++) {
			const LangString *ls = _strings[(i << 11) + j];
			const Case *casep;
			const char *cmdp;

			/* For undefined strings, just set that it's an empty string */
			if (ls == NULL) {
				WriteLength(f, 0);
				continue;
			}

			_cur_ident = ls->name;
			_cur_line = ls->line;

			/* Produce a message if a string doesn't have a translation. */
			if (_show_todo > 0 && ls->translated == NULL) {
				if ((_show_todo & 2) != 0) {
					strgen_warning("'%s' is untranslated", ls->name);
				}
				if ((_show_todo & 1) != 0) {
					const char *s = "<TODO> ";
					while (*s != '\0') PutByte(*s++);
				}
			}

			/* Extract the strings and stuff from the english command string */
			ExtractCommandString(&_cur_pcs, ls->english, false);

			if (ls->translated_case != NULL || ls->translated != NULL) {
				casep = ls->translated_case;
				cmdp = ls->translated;
			} else {
				casep = ls->english_case;
				cmdp = ls->english;
			}

			_translated = _masterlang || (cmdp != ls->english);

			if (casep != NULL) {
				const Case *c;
				uint num;

				/* Need to output a case-switch.
				 * It has this format
				 * <0x9E> <NUM CASES> <CASE1> <LEN1> <STRING1> <CASE2> <LEN2> <STRING2> <CASE3> <LEN3> <STRING3> <STRINGDEFAULT>
				 * Each LEN is printed using 2 bytes in big endian order. */
				PutUtf8(SCC_SWITCH_CASE);
				/* Count the number of cases */
				for (num = 0, c = casep; c; c = c->next) num++;
				PutByte(num);

				/* Write each case */
				for (c = casep; c != NULL; c = c->next) {
					int pos;

					PutByte(c->caseidx);
					/* Make some space for the 16-bit length */
					pos = _put_pos;
					PutByte(0);
					PutByte(0);
					/* Write string */
					PutCommandString(c->string);
					PutByte(0); // terminate with a zero
					/* Fill in the length */
					_put_buf[pos + 0] = GB(_put_pos - (pos + 2), 8, 8);
					_put_buf[pos + 1] = GB(_put_pos - (pos + 2), 0, 8);
				}
			}

			if (cmdp != NULL) PutCommandString(cmdp);

			WriteLength(f, _put_pos);
			fwrite(_put_buf, 1, _put_pos, f);
			_put_pos = 0;
		}
	}

	fputc(0, f);
	fclose(f);
}

/** Multi-OS mkdirectory function */
static inline void ottd_mkdir(const char *directory)
{
#if defined(WIN32) || defined(__WATCOMC__)
		mkdir(directory);
#else
		mkdir(directory, 0755);
#endif
}

/** Create a path consisting of an already existing path, a possible
 * path seperator and the filename. The seperator is only appended if the path
 * does not already end with a seperator */
static inline char *mkpath(char *buf, size_t buflen, const char *path, const char *file)
{
	char *p;
	ttd_strlcpy(buf, path, buflen); // copy directory into buffer

	p = strchr(buf, '\0'); // add path seperator if necessary
	if (p[-1] != PATHSEPCHAR && (size_t)(p - buf) + 1 < buflen) *p++ = PATHSEPCHAR;
	ttd_strlcpy(p, file, buflen - (size_t)(p - buf)); // catenate filename at end of buffer
	return buf;
}

#if defined(__MINGW32__)
/**
 * On MingW, it is common that both / as \ are accepted in the
 * params. To go with those flow, we rewrite all incoming /
 * simply to \, so internally we can safely assume \.
 */
static inline char *replace_pathsep(char *s)
{
	char *c;

	for (c = s; *c != '\0'; c++) if (*c == '/') *c = '\\';
	return s;
}
#else
static inline char *replace_pathsep(char *s) { return s; }
#endif

int CDECL main(int argc, char *argv[])
{
	char pathbuf[MAX_PATH];
	const char *src_dir = ".";
	const char *dest_dir = NULL;

	while (argc > 1 && *argv[1] == '-') {
		if (strcmp(argv[1], "-v") == 0 || strcmp(argv[1], "--version") == 0) {
			puts("$Revision$");
			return 0;
		}

		if (strcmp(argv[1], "-t") == 0 || strcmp(argv[1], "--todo") == 0) {
			_show_todo |= 1;
			argc--, argv++;
			continue;
		}

		if (strcmp(argv[1], "-w") == 0 || strcmp(argv[1], "--warning") == 0) {
			_show_todo |= 2;
			argc--, argv++;
			continue;
		}

		if (strcmp(argv[1], "-h") == 0 || strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0) {
			puts(
				"strgen - $Revision$\n"
				" -v | --version    print version information and exit\n"
				" -t | --todo       replace any untranslated strings with '<TODO>'\n"
				" -w | --warning    print a warning for any untranslated strings\n"
				" -h | -? | --help  print this help message and exit\n"
				" -s | --source_dir search for english.txt in the specified directory\n"
				" -d | --dest_dir   put output file in the specified directory, create if needed\n"
				" Run without parameters and strgen will search for english.txt and parse it,\n"
				" creating strings.h. Passing an argument, strgen will translate that language\n"
				" file using english.txt as a reference and output <language>.lng."
			);
			return 0;
		}

		if (argc > 2 && (strcmp(argv[1], "-s") == 0 || strcmp(argv[1], "--source_dir") == 0)) {
			src_dir = replace_pathsep(argv[2]);
			argc -= 2, argv += 2;
			continue;
		}

		if (argc > 2 && (strcmp(argv[1], "-d") == 0 || strcmp(argv[1], "--dest_dir") == 0)) {
			dest_dir = replace_pathsep(argv[2]);
			argc -= 2, argv += 2;
			continue;
		}

		fprintf(stderr, "Invalid arguments\n");
		return 0;
	}

	if (dest_dir == NULL) dest_dir = src_dir; // if dest_dir is not specified, it equals src_dir

	/* strgen has two modes of operation. If no (free) arguments are passed
	 * strgen generates strings.h to the destination directory. If it is supplied
	 * with a (free) parameter the program will translate that language to destination
	 * directory. As input english.txt is parsed from the source directory */
	if (argc == 1) {
		mkpath(pathbuf, lengthof(pathbuf), src_dir, "english.txt");

		/* parse master file */
		_masterlang = true;
		ParseFile(pathbuf, true);
		MakeHashOfStrings();
		if (_errors) return 1;

		/* write strings.h */
		ottd_mkdir(dest_dir);
		mkpath(pathbuf, lengthof(pathbuf), dest_dir, "strings.h");
		WriteStringsH(pathbuf);
	} else if (argc == 2) {
		char *r;

		mkpath(pathbuf, lengthof(pathbuf), src_dir, "english.txt");

		/* parse master file and check if target file is correct */
		_masterlang = false;
		ParseFile(pathbuf, true);
		MakeHashOfStrings();
		ParseFile(replace_pathsep(argv[1]), false); // target file
		if (_errors) return 1;

		/* get the targetfile, strip any directories and append to destination path */
		r = strrchr(argv[1], PATHSEPCHAR);
		mkpath(pathbuf, lengthof(pathbuf), dest_dir, (r != NULL) ? &r[1] : argv[1]);

		/* rename the .txt (input-extension) to .lng */
		r = strrchr(pathbuf, '.');
		if (r == NULL || strcmp(r, ".txt") != 0) r = strchr(pathbuf, '\0');
		ttd_strlcpy(r, ".lng", (size_t)(r - pathbuf));
		WriteLangfile(pathbuf);

		/* if showing warnings, print a summary of the language */
		if ((_show_todo & 2) != 0) {
			fprintf(stdout, "%d warnings and %d errors for %s\n", _warnings, _errors, pathbuf);
		}
	} else {
		fprintf(stderr, "Invalid arguments\n");
	}

	return 0;
}