using System;
using System.Diagnostics;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Reflection;

using miew.String;
using miew.Enumerable;
using miew.Reflection;

namespace agree
{

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/// <summary>
	/// Set of tokenized TDL files which comprise a grammar
	/// </summary>
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public class GrammarFileSet
	{
		public IEnumerable<TdlTok> tdl = Enumerable.Empty<TdlTok>();
		public IEnumerable<TdlTok> grules = Enumerable.Empty<TdlTok>();
		public IEnumerable<TdlTok> irules = Enumerable.Empty<TdlTok>();
		public IEnumerable<TdlTok> lexicon = Enumerable.Empty<TdlTok>();
		public IEnumerable<TdlTok> labels = Enumerable.Empty<TdlTok>();
		public IEnumerable<TdlTok> roots = Enumerable.Empty<TdlTok>();
		public List<IrregInfo> irregs = null;
		Dictionary<Char, String> letter_sets = new Dictionary<Char, String>();
		public String description = null;
		public String author = null;

		[Serializable]
		public struct IrregInfo
		{
			public IrregInfo(String irreg_line)
			{
				var data = irreg_line.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
				if (data.Length != 3)
					throw new Exception("invalid irregular inflection specification");
				inflected = data[0];
				s_rule = data[1];
				stem = data[2];
			}
			public String inflected;
			public String stem;
			public String s_rule;
		};

		public GrammarFileSet(Config config, String filename)
		{
			ConfigFileReader cfr = null;

			String base_dir = Path.GetDirectoryName(filename);
			using (StringReader sr = new StringReader(miew.IO.File.Read(filename)))
			{
				String l;
				while ((l = sr.ReadLine()) != null)
				{
					int ix;
					if ((ix = l.IndexOf(';')) != -1)
						l = l.Remove(ix);

					String[] rgs = l.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
					if (rgs.Length == 0)
						continue;
					String s_keyword = rgs[0].ToLower();
					if (s_keyword == "description")
					{
						description = rgs.Skip(1).StringJoin(" ").Trim('\"');
					}
					else if (s_keyword == "author")
					{
						author = rgs.Skip(1).StringJoin(" ").Trim('\"');
					}
					else if (s_keyword == "tokenizer")
					{
						AssemblyName an = new AssemblyName(Path.GetFileNameWithoutExtension(rgs[1]));
						Assembly tok_asm = Assembly.Load(an);

						if (tok_asm == null)
							throw new Exception(String.Format("The tokenizer assembly {0} could not be loaded", rgs[1]));

						System.Type[] rg_ttok = tok_asm
												.GetTypes()
												.Where(st => st.HasInterface(typeof(miew.Tokenization.ITokenizer)))
												.ToArray();
						if (rg_ttok.Length == 0)
							throw new Exception(String.Format("No tokenizers were found in the file {0}", rgs[1]));

						if (rg_ttok.Length > 1)
							throw new Exception(String.Format("Multiple tokenizers not supported"));

						config.parser.TokenizerType = rg_ttok[0];
					}
					else if (rgs.Length == 2)
					{
						String s_file = rgs[1].Trim('\"');

						if (!File.Exists(s_file))
						{
							String s2;
							if (Path.IsPathRooted(s_file) || !File.Exists(s2 = Path.Combine(base_dir, s_file)))
								throw new Exception(String.Format("The file '{0}' listed as '{1}' in '{2}' was not found", s_file, s_keyword, filename));
							s_file = s2;
						}

						//Console.WriteLine("reading {0}", s_file);
						var tdlt = new TdlTokenizer(s_file, letter_sets);
						switch (s_keyword)
						{
							case "tdl":
								tdl = tdl.Concat(tdlt);
								break;
							case "lexicon":
								lexicon = lexicon.Concat(tdlt);
								break;
							case "lexical-rules":
							case "inflection-rules":
								irules = irules.Concat(tdlt);
								break;
							case "irregular-inflections":
								irregs = irregs ?? new List<IrregInfo>();
								foreach (String _l in File.ReadAllLines(s_file))
								{
									String irreg_line = _l.Trim('\"', ' ', '\t');
									if (irreg_line != String.Empty)
										irregs.Add(new IrregInfo(irreg_line));
								}
								break;
							case "grammar-rules":
								grules = grules.Concat(tdlt);
								break;
							case "node-labels":
								labels = labels.Concat(tdlt);
								break;
							case "start-symbols":
								roots = roots.Concat(tdlt);
								break;
							case "lkbconfig":
								if (cfr is PetGlobals)
									throw new Exception("cannot mix PET and LKB configuration files.");
								cfr = cfr ?? new LkbConfig(config);
								cfr.ReadConfigFile(s_file);
								break;
							case "petconfig":
								if (cfr is LkbConfig)
									throw new Exception("cannot mix LKB and PET configuration files.");
								cfr = cfr ?? new PetGlobals(config);
								cfr.ReadConfigFile(s_file);
								break;
							case "quick-check-paths":
								{
									config.parser.s_quick_check_paths = File.ReadAllLines(s_file).ToList();
#if QUICK_CHECK_ALL_NODES
									for (int i = 0; i < s_quick_check_paths.Count; i++)
									{
										for (int j = i + 1; j < s_quick_check_paths.Count; )
										{
											if (s_quick_check_paths[i].StartsWith(s_quick_check_paths[j]))
											{
												Console.WriteLine("removing qc path {0}", s_quick_check_paths[j]);
												s_quick_check_paths.RemoveAt(j);
											}
											else
												j++;
										}
									}
									Console.WriteLine("now {0} qc paths", s_quick_check_paths.Count);
#endif
									//quick_check_paths.AddRange(s_quick_check_paths.Select(qcp => new FsPath(qcp)));
								}
								break;
							default:
								throw new Exception(String.Format("unknown file type '{0}'", s_keyword));
						}
					}
					else
						throw new Exception("script file error");
				}
			}
		}
	};


	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///
	///
	///
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public class TdlTokenizer : IEnumerable<TdlTok>
	{
		const int tab_setting = 4;
		static readonly HashSet<Char> ident_stop = new HashSet<Char>("\n\r\t \";[]<>!&,.#:=".ToCharArray());
		static readonly Char[] one_space = new Char[] { ' ' };

		public TdlTokenizer(String s_file, Dictionary<Char, String> letter_sets)
		{
			this.s_file = s_file;
			this.s_cont = miew.IO.File.Read(s_file);
			this.letter_sets = letter_sets;
		}

		readonly String s_file;
		readonly String s_cont;
		readonly Dictionary<Char, String> letter_sets;

		int line = 1;
		int col_start = 0;
		int i;
		Char ch;

		ErrorPos ErrorPos { get { return new ErrorPos(line, i - col_start, s_file); } }
		ErrorPos NextErrorPos { get { return new ErrorPos(line, i + 1 - col_start, s_file); } }


		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		void ProcessLetterSet(String left, String right)
		{
			if (left[0] != '(' || !right.EndsWith(")"))
				ErrorExit(ErrorPos, "Unmatched parentheses in letter set definition: '{0} {1}'.", left, right);

			String macro = left.Substring(1);
			String letter_set = right.Remove(right.Length - 1);

			if (macro.Length != 2 || macro[0] != '!')
				ErrorExit(ErrorPos, "Inflection letter set macro symbol must be a single character prefixed with '!'.");

			Char macro_char = macro[1];
			if (letter_sets.ContainsKey(macro_char))
				ErrorExit(ErrorPos, "Inflection letter set macro symbol '{0}' was already defined.", macro);

			letter_set = letter_set.Replace("\\", "\\\\").Replace("]", "\\]").Replace("-", "\\-").Replace("^", "\\^");

			letter_sets.Add(macro_char, String.Format("([{0}])", letter_set));
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// Process regular morphology specification. Because lines such as [%prefix (!f (!f)] are permitted, the 
		/// parenthesis handling is messy. This code is also complicated by the fact that we are using system
		/// RegEx, so we must be careful exactly what parts of the morphology specification we escape and how we
		/// interpret TDL escaping.
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		TdlTok ProcessAffix(String s_subrule, bool f_pfx, int i_start, IEnumerable<String> rgs)
		{
			TdlTokMorphology tokm = f_pfx ? (TdlTokMorphology)
				new TdlTokPrefix(line, i_start - col_start, s_file) :
				new TdlTokSuffix(line, i_start - col_start, s_file);

			/// A list of distinct (arbitrary) placeholder characters and their eventual string replacements
			List<KeyValuePair<Char, String>> inserts = new List<KeyValuePair<Char, String>>();

			/// Starting placeholder character
			Char ch_ins = '\uE000';

			/// Process each tuple on the morphology specifcation line
			foreach (var pair in rgs.PairOff())
			{
				/// To handle [%prefix (!f (!f)], we only strip the mandatory outer parentheses after the tuples have
				/// been paired off.
				if (pair.Key[0] != '(' || !pair.Value.EndsWith(")"))
					ErrorExit(ErrorPos, "Unmatched parentheses in inflectional rule: '{0}'.", s_subrule);

				/// For parsing, the affix is already attached, so the right side of each subrule tuple is the input, and
				/// the left side is the output. Hence the reversal
				String s_inp = pair.Value.Remove(pair.Value.Length - 1).Replace("*", "");
				String s_out = pair.Key.Substring(1).Replace("*", "");

				/// 1. The index of the first RegEx capture group depends on whether it's a prefix or suffix
				int i_capture = f_pfx ? 1 : 2;

				/// 2. Replace each macro sequence with a unique unicode character taken from the private use area
				int ix_additional_use, ix_bang = 0;
				while ((ix_bang = s_inp.IndexOf('!', ix_bang)) != -1 && ix_bang < s_inp.Length - 1)
				{
					Char macro_char = s_inp[ix_bang + 1];
					String macro = "!" + macro_char;
					if (letter_sets == null)
						ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} must be defined in the same file before using it. There are no macro symbols defined.", macro, pair.Key, pair.Value);

					String rx_part;
					if (!letter_sets.TryGetValue(macro_char, out rx_part))
						ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} was not defined.", macro, pair.Key, pair.Value);

					if (!s_out.Contains(macro_char))
						ErrorExit(ErrorPos, "Inflectional pair {0} {1} uses macro symbol {2} in the input but not the output.", pair.Key, pair.Value, macro);

					/// Generate a new placeholder for the first occurrence of this macro in what will be the input RegEx
					s_inp = s_inp.Remove(ix_bang, 2).Insert(ix_bang, ch_ins.ToString());
					inserts.Add(new KeyValuePair<Char, String>(ch_ins++, rx_part));

					/// Check for additional uses of this macro, still in the *input* RegEx
					ix_additional_use = s_inp.IndexOf(macro);
					if (ix_additional_use != -1)
					{
						do
							s_inp = s_inp.Remove(ix_additional_use, 2).Insert(ix_additional_use, ch_ins.ToString());
						while ((ix_additional_use = s_inp.IndexOf(macro, ix_additional_use)) != -1);
						inserts.Add(new KeyValuePair<Char, String>(ch_ins++, @"\" + i_capture.ToString()));
					}

					/// Reference the corresponding capture in the *replacement* part of the RegEx
					s_out = s_out.Replace(macro, "$" + i_capture.ToString());

					/// Keep track of the capture indexes that the RegEx will generate
					i_capture++;
				}

				/// 3. Now that macros are out of the way, we can escape the non-macro parts for RegEx
				s_inp = Regex.Escape(s_inp);

				/// 4. Now we can replace each placeholder with its corresponding string
				foreach (var kvp in inserts)
					s_inp = s_inp.Replace(kvp.Key.ToString(), kvp.Value);

				/// 5. Complete the RegEx expression by including the non-affixed body
				if (f_pfx)
				{
					s_inp = "^" + s_inp + "(.*)$";
					s_out = s_out + "$" + i_capture.ToString();
				}
				else
				{
					s_inp = "^(.*)" + s_inp + "$";
					s_out = "$1" + s_out;
				}

				/// 6. Create a RegEx and add it to the Tdl token
				tokm.subrules.Add(new MorphologySubrule
				{
					regex = new Regex(s_inp, RegexOptions.Compiled | RegexOptions.Singleline),
					replace = s_out
				});
			}
			return tokm;
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		TdlTok LineBegin()
		{
			if (s_cont[i] == '%')
			{
				int i_start = ++i;
				while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
					i++;

				String s_subrule = s_cont.Substring(i_start, i - i_start).Trim();
				if (s_subrule.Length == 0)
					throw new TdlException(ErrorPos, "Expected a morphology specification after '%'");

				if (s_subrule[0] == '(' && s_subrule[s_subrule.Length - 1] == ')')
					s_subrule = s_subrule.Substring(1, s_subrule.Length - 2);

				String[] rgs = s_subrule
					.Replace(@"\(", "(")
					.Replace(@"\)", ")")
					.Replace(@"\!", "!")
					.Replace(@"\?", "?")
					.Split(one_space, StringSplitOptions.RemoveEmptyEntries);

				if (rgs.Length < 2)
					ErrorExit(ErrorPos, "Syntax error in inflectional rule: '{0}'.", s_subrule);

				switch (rgs[0].ToLower())
				{
					case "letter-set":
						if (rgs.Length != 3)
							ErrorExit(ErrorPos, "Invalid inflection letter set definition: {0}.", s_subrule);
						ProcessLetterSet(rgs[1], rgs[2]);
						break;

					case "prefix":
						if (((rgs.Length - 1) & 1) > 0)
							TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
						return ProcessAffix(s_subrule, true, i_start, rgs.Skip(1));

					case "suffix":
						if (((rgs.Length - 1) & 1) > 0)
							TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
						return ProcessAffix(s_subrule, false, i_start, rgs.Skip(1));

					default:
						ErrorExit(ErrorPos, "Unrecognized inflectional rule type: '{0}'.", rgs[0].ToLower());
						break;
				}
			}
			return null;
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		TdlTok LineEnd()
		{
			if (i < s_cont.Length && ch == 13 && s_cont[i] == 10)	// windows cr-lf
				i++;
			col_start = i;
			line++;
			return i < s_cont.Length ? LineBegin() : null;
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public IEnumerator<TdlTok> GetEnumerator()
		{
			int sq_nest = 0;
			int ang_nest = 0;
			int dl_nest = 0;
			i = 0;

			TdlTok tokm = LineBegin();
			if (tokm != null)
				yield return tokm;

			while (i < s_cont.Length)
			{
				ch = s_cont[i];
				if (ch == 13 || ch == 10)
				{
					i++;
					if ((tokm = LineEnd()) != null)
						yield return tokm;
				}
				else if (ch == '\"' || ch == '\'')	// double-quoted or lisp-style 'string
				{
					Char ch_term = ch == '\"' ? ch : ' ';
					i++;
					int i_start = i;
					while (i < s_cont.Length)
					{
						ch = s_cont[i++];
						if (ch == ch_term)
							break;
						if (ch == 13 || ch == 10)
							if ((tokm = LineEnd()) != null)
								yield return tokm;
					}

					if (i >= s_cont.Length)
						ErrorExit(ErrorPos, "End-of-file reading string constant.");

					yield return new TdlTok(line, i - i_start, s_file, TdlTok.Type.String, s_cont.Substring(i_start, i - 1 - i_start));
				}
				else if (ch == ' ')		// ignore whitespace
				{
					i++;
				}
				else if (ch == '\t')	// ignore tab
				{
					i++;
					col_start -= tab_setting;
				}
				else if (ch == ';')		// comment extends to end of line
				{
					while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
						i++;
					i++;
					if ((tokm = LineEnd()) != null)
						yield return tokm;
				}
				else if (ch == ':')
				{
					while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
						;

					if (ch == '=' || ch == '<')
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Define);
					else if (ch == '+')
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Append);
					else
						ErrorExit(NextErrorPos, "Error: ':" + ch + "'; expected :=, :<, or :+");
					i++;
				}
				else if (ch == '[')
				{
					i++;
					sq_nest++;
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareOpen);
				}
				else if (ch == ']')
				{
					i++;
					if (sq_nest == 0)
						ErrorExit(ErrorPos, "Unmatched closing square-bracket ']'.");
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareClose);
					sq_nest--;
				}
				else if (ch == '<')
				{
					while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
						;
					if (ch == '!')
					{
						i++;
						dl_nest++;
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListOpen);
					}
					else
					{
						ang_nest++;
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleOpen);
					}
				}
				else if (ch == '>')
				{
					i++;
					if (ang_nest == 0)
						ErrorExit(ErrorPos, "Unmatched closing angle-bracket '>'.");
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleClose);
					ang_nest--;
				}
				else if (ch == '!')
				{
					while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
						;
					if (ch != '>')
						ErrorExit(NextErrorPos, "Error: '!" + ch + "'");
					if (dl_nest == 0)
						ErrorExit(ErrorPos, "Unmatched difference list closing symbol '!>'.");

					i++;
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListClose);
					dl_nest--;
				}
				else if (ch == '&')
				{
					i++;
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ampersand);
				}
				else if (ch == ',')
				{
					i++;
					yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Comma);
				}
				else if (ch == '.')
				{
					i++;
					if (i + 1 < s_cont.Length && s_cont[i] == '.' && s_cont[i + 1] == '.')
					{
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ellipsis);
						i += 2;
					}
					else
						yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Dot);
				}
				else if (ch == '#' && i + 1 < s_cont.Length && s_cont[i + 1] == '|')		// block comment
				{
					i += 2;
					while (i < s_cont.Length - 1 && ((ch = s_cont[i]) != '|' || s_cont[i + 1] != '#'))
					{
						i++;
						if (ch == 13 || ch == 10)
							if ((tokm = LineEnd()) != null)
								yield return tokm;
					}
					i += 2;
					if (i > s_cont.Length)
						ErrorExit(ErrorPos, "End-of-file in block comment.");
				}
				else if (ch == '#')
				{
					i++;
					int i_start = i;
					while (++i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
						;

					if (i_start == i)
						ErrorExit(NextErrorPos, "Unexpected character: " + ch);

					yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Tag, s_cont.Substring(i_start, i - i_start).ToLower());
				}
				else				// identifier
				{
					int i_start = i;
					while (++i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
						;

					if (i_start == i)
						ErrorExit(NextErrorPos, "Unexpected character: " + ch);

					yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Identifier, s_cont.Substring(i_start, i - i_start).ToLower());
				}
			}
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public static void ErrorExit(ErrorPos t, String msg)
		{
			TdlTok tk = t as TdlTok;
			String s_msg;
			if (tk != null)
				s_msg = String.Format("Error in {0}, line {1}, col {2}: {3} : {4}", t.file, t.line, t.col, tk, msg);
			else
				s_msg = String.Format("Error in {0}, line {1}, col {2}: {3}", t.file, t.line, t.col, msg);
			throw new TdlException(t, s_msg);
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public static void ErrorExit(ErrorPos t, String fmt, params Object[] args)
		{
			ErrorExit(t, String.Format(fmt, args));
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// 
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public static void ErrorExit(String msg)
		{
			throw new TdlException("Error: {0}", msg);
		}

		System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
		{
			throw new NotImplementedException();
		}
	};
}