using System;
using System.Diagnostics;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Reflection;

using miew.String;
using miew.Enumerable;
using miew.Reflection;

namespace agree
{
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /// <summary>
    /// Set of tokenized TDL files which comprise a grammar. The set is populated by reading a script file in which
    /// each non-empty line is a keyword followed by its argument(s); everything after a ';' on a line is ignored.
    /// </summary>
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    public class GrammarFileSet
    {
        // Token streams, one per kind of grammar file. Several files of the same kind are concatenated (lazily).
        public IEnumerable<TdlTok> tdl = Enumerable.Empty<TdlTok>();
        public IEnumerable<TdlTok> grules = Enumerable.Empty<TdlTok>();
        public IEnumerable<TdlTok> irules = Enumerable.Empty<TdlTok>();
        public IEnumerable<TdlTok> lexicon = Enumerable.Empty<TdlTok>();
        public IEnumerable<TdlTok> labels = Enumerable.Empty<TdlTok>();
        public IEnumerable<TdlTok> roots = Enumerable.Empty<TdlTok>();

        // Remains null unless at least one 'irregular-inflections' file is listed in the script.
        public List<IrregInfo> irregs = null;

        // Letter-set macros, shared by every tokenizer created for this file set.
        Dictionary<Char, String> letter_sets = new Dictionary<Char, String>();

        public String description = null;
        public String author = null;

        /// <summary>
        /// One entry of an irregular inflections file: three whitespace-separated fields, in the order
        /// inflected form, rule name, stem.
        /// </summary>
        [Serializable]
        public struct IrregInfo
        {
            /// <param name="irreg_line">A line consisting of exactly three whitespace-separated fields.</param>
            /// <exception cref="Exception">The line does not have exactly three fields.</exception>
            public IrregInfo(String irreg_line)
            {
                var data = irreg_line.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
                if (data.Length != 3)
                    throw new Exception("invalid irregular inflection specification");
                inflected = data[0];
                s_rule = data[1];
                stem = data[2];
            }
            public String inflected;
            public String stem;
            public String s_rule;
        }

        /// <summary>
        /// Reads the script file <paramref name="filename"/> and sets up the token streams and configuration it names.
        /// </summary>
        /// <param name="config">Receives the tokenizer type and quick-check paths, and is handed to the LKB/PET
        /// configuration file readers.</param>
        /// <param name="filename">Path of the script file. Files it lists which are not found as written are also
        /// looked for relative to the directory of this file.</param>
        /// <exception cref="Exception">A listed file is missing, a keyword is unknown, a line is malformed, LKB and
        /// PET configuration files are mixed, or the tokenizer assembly does not contain exactly one tokenizer.</exception>
        public GrammarFileSet(Config config, String filename)
        {
            ConfigFileReader cfr = null;
            String base_dir = Path.GetDirectoryName(filename);

            using (StringReader sr = new StringReader(miew.IO.File.Read(filename)))
            {
                String l;
                while ((l = sr.ReadLine()) != null)
                {
                    // Strip the comment, if any
                    int ix;
                    if ((ix = l.IndexOf(';')) != -1)
                        l = l.Remove(ix);

                    String[] rgs = l.Split(default(Char[]), StringSplitOptions.RemoveEmptyEntries);
                    if (rgs.Length == 0)
                        continue;

                    String s_keyword = rgs[0].ToLower();
                    if (s_keyword == "description")
                    {
                        description = rgs.Skip(1).StringJoin(" ").Trim('\"');
                    }
                    else if (s_keyword == "author")
                    {
                        author = rgs.Skip(1).StringJoin(" ").Trim('\"');
                    }
                    else if (s_keyword == "tokenizer")
                    {
                        // Without this check a bare 'tokenizer' line surfaced as an IndexOutOfRangeException
                        if (rgs.Length < 2)
                            throw new Exception(String.Format("The 'tokenizer' keyword in '{0}' requires an assembly name", filename));

                        AssemblyName an = new AssemblyName(Path.GetFileNameWithoutExtension(rgs[1]));
                        Assembly tok_asm = Assembly.Load(an);
                        if (tok_asm == null)
                            throw new Exception(String.Format("The tokenizer assembly {0} could not be loaded", rgs[1]));

                        System.Type[] rg_ttok = tok_asm
                                                    .GetTypes()
                                                    .Where(st => st.HasInterface(typeof(miew.Tokenization.ITokenizer)))
                                                    .ToArray();

                        if (rg_ttok.Length == 0)
                            throw new Exception(String.Format("No tokenizers were found in the file {0}", rgs[1]));
                        if (rg_ttok.Length > 1)
                            throw new Exception("Multiple tokenizers not supported");

                        config.parser.TokenizerType = rg_ttok[0];
                    }
                    else if (rgs.Length == 2)
                    {
                        // Try the name as written first, then relative to the script file's directory
                        String s_file = rgs[1].Trim('\"');
                        if (!File.Exists(s_file))
                        {
                            String s2;
                            if (Path.IsPathRooted(s_file) || !File.Exists(s2 = Path.Combine(base_dir, s_file)))
                                throw new Exception(String.Format("The file '{0}' listed as '{1}' in '{2}' was not found", s_file, s_keyword, filename));
                            s_file = s2;
                        }
                        //Console.WriteLine("reading {0}", s_file);

                        // A TdlTokenizer reads its whole file when constructed, so one is only created for the
                        // keywords which actually name TDL files.
                        switch (s_keyword)
                        {
                            case "tdl":
                                tdl = tdl.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "lexicon":
                                lexicon = lexicon.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "lexical-rules":
                            case "inflection-rules":
                                irules = irules.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "irregular-inflections":
                                irregs = irregs ?? new List<IrregInfo>();
                                foreach (String _l in File.ReadAllLines(s_file))
                                {
                                    String irreg_line = _l.Trim('\"', ' ', '\t');
                                    if (irreg_line != String.Empty)
                                        irregs.Add(new IrregInfo(irreg_line));
                                }
                                break;
                            case "grammar-rules":
                                grules = grules.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "node-labels":
                                labels = labels.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "start-symbols":
                                roots = roots.Concat(new TdlTokenizer(s_file, letter_sets));
                                break;
                            case "lkbconfig":
                                if (cfr is PetGlobals)
                                    throw new Exception("cannot mix PET and LKB configuration files.");
                                cfr = cfr ?? new LkbConfig(config);
                                cfr.ReadConfigFile(s_file);
                                break;
                            case "petconfig":
                                if (cfr is LkbConfig)
                                    throw new Exception("cannot mix LKB and PET configuration files.");
                                cfr = cfr ?? new PetGlobals(config);
                                cfr.ReadConfigFile(s_file);
                                break;
                            case "quick-check-paths":
                                {
                                    List<String> qc_paths = File.ReadAllLines(s_file).ToList();
#if QUICK_CHECK_ALL_NODES
                                    // Drop every later path which is a prefix of an earlier one
                                    for (int i = 0; i < qc_paths.Count; i++)
                                    {
                                        for (int j = i + 1; j < qc_paths.Count; )
                                        {
                                            if (qc_paths[i].StartsWith(qc_paths[j]))
                                            {
                                                Console.WriteLine("removing qc path {0}", qc_paths[j]);
                                                qc_paths.RemoveAt(j);
                                            }
                                            else
                                                j++;
                                        }
                                    }
                                    Console.WriteLine("now {0} qc paths", qc_paths.Count);
#endif
                                    config.parser.s_quick_check_paths = qc_paths;
                                    //quick_check_paths.AddRange(s_quick_check_paths.Select(qcp => new FsPath(qcp)));
                                }
                                break;
                            default:
                                throw new Exception(String.Format("unknown file type '{0}'", s_keyword));
                        }
                    }
                    else
                        throw new Exception("script file error");
                }
            }
        }
    }

    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /// <summary>
    /// Tokenizer for a single TDL file. The file content is read when the tokenizer is constructed; the tokens are
    /// produced lazily while enumerating. The scan position is kept in instance fields, so enumerations of the same
    /// instance must not be interleaved.
    /// NOTE(review): a file containing '%(letter-set ...)' lines adds to the shared letter-set dictionary each time it
    /// is enumerated, so enumerating such a file a second time reports the macro as already defined.
    /// </summary>
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    public class TdlTokenizer : IEnumerable<TdlTok>
    {
        // Each tab character moves the reported column by this many additional positions
        const int tab_setting = 4;

        // Characters which end an identifier or tag
        static readonly HashSet<Char> ident_stop = new HashSet<Char>("\n\r\t \";[]<>!&,.#:=".ToCharArray());

        static readonly Char[] one_space = new Char[] { ' ' };

        /// <param name="s_file">Path of the TDL file; its content is read immediately.</param>
        /// <param name="letter_sets">Letter-set macro table. Definitions found in the file are added to it, and
        /// morphology rules in the file are expanded with it.</param>
        public TdlTokenizer(String s_file, Dictionary<Char, String> letter_sets)
        {
            this.s_file = s_file;
            this.s_cont = miew.IO.File.Read(s_file);
            this.letter_sets = letter_sets;
        }

        readonly String s_file;
        readonly String s_cont;
        readonly Dictionary<Char, String> letter_sets;

        int line = 1;           // current line number
        int col_start = 0;      // index in s_cont which corresponds to column zero of the current line
        int i;                  // current index in s_cont
        Char ch;                // most recently examined character

        ErrorPos ErrorPos { get { return new ErrorPos(line, i - col_start, s_file); } }

        ErrorPos NextErrorPos { get { return new ErrorPos(line, i + 1 - col_start, s_file); } }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Records a letter-set definition. <paramref name="left"/> must be '(' followed by '!' and one macro
        /// character; <paramref name="right"/> is the set of letters followed by ')'. The set is stored as a capturing
        /// RegEx character class, keyed by the macro character.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        void ProcessLetterSet(String left, String right)
        {
            if (left[0] != '(' || !right.EndsWith(")"))
                ErrorExit(ErrorPos, "Unmatched parentheses in letter set definition: '{0} {1}'.", left, right);

            String macro = left.Substring(1);
            String letter_set = right.Remove(right.Length - 1);

            if (macro.Length != 2 || macro[0] != '!')
                ErrorExit(ErrorPos, "Inflection letter set macro symbol must be a single character prefixed with '!'.");

            Char macro_char = macro[1];
            if (letter_sets.ContainsKey(macro_char))
                ErrorExit(ErrorPos, "Inflection letter set macro symbol '{0}' was already defined.", macro);

            // Escape the characters which are special inside a RegEx character class
            letter_set = letter_set.Replace("\\", "\\\\").Replace("]", "\\]").Replace("-", "\\-").Replace("^", "\\^");

            letter_sets.Add(macro_char, String.Format("([{0}])", letter_set));
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Process regular morphology specification. Because lines such as [%prefix (!f (!f)] are permitted, the
        /// parenthesis handling is messy. This code is also complicated by the fact that we are using system
        /// RegEx, so we must be careful exactly what parts of the morphology specification we escape and how we
        /// interpret TDL escaping.
        /// </summary>
        /// <param name="s_subrule">Text of the whole specification, used in error messages.</param>
        /// <param name="f_pfx">true for a prefix rule, false for a suffix rule.</param>
        /// <param name="i_start">Index in the file content where the specification starts.</param>
        /// <param name="rgs">The parts of the specification after the rule type; consecutive parts form a pair.</param>
        /// <returns>A prefix or suffix token carrying one RegEx subrule per pair.</returns>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        TdlTok ProcessAffix(String s_subrule, bool f_pfx, int i_start, IEnumerable<String> rgs)
        {
            TdlTokMorphology tokm = f_pfx ?
                            (TdlTokMorphology)new TdlTokPrefix(line, i_start - col_start, s_file) :
                            new TdlTokSuffix(line, i_start - col_start, s_file);

            // A list of distinct (arbitrary) placeholder characters and their eventual string replacements
            List<KeyValuePair<Char, String>> inserts = new List<KeyValuePair<Char, String>>();

            // Starting placeholder character
            Char ch_ins = '\uE000';

            // Process each tuple on the morphology specification line
            foreach (var pair in rgs.PairOff())
            {
                // To handle [%prefix (!f (!f)], we only strip the mandatory outer parentheses after the tuples have
                // been paired off.
                if (pair.Key[0] != '(' || !pair.Value.EndsWith(")"))
                    ErrorExit(ErrorPos, "Unmatched parentheses in inflectional rule: '{0}'.", s_subrule);

                // For parsing, the affix is already attached, so the right side of each subrule tuple is the input, and
                // the left side is the output. Hence the reversal
                String s_inp = pair.Value.Remove(pair.Value.Length - 1).Replace("*", "");
                String s_out = pair.Key.Substring(1).Replace("*", "");

                // 1. The index of the first RegEx capture group depends on whether it's a prefix or suffix
                int i_capture = f_pfx ? 1 : 2;

                // 2. Replace each macro sequence with a unique unicode character taken from the private use area
                int ix_additional_use, ix_bang = 0;
                while ((ix_bang = s_inp.IndexOf('!', ix_bang)) != -1 && ix_bang < s_inp.Length - 1)
                {
                    Char macro_char = s_inp[ix_bang + 1];
                    String macro = "!" + macro_char;

                    if (letter_sets == null)
                        ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} must be defined in the same file before using it. There are no macro symbols defined.", macro, pair.Key, pair.Value);

                    String rx_part;
                    if (!letter_sets.TryGetValue(macro_char, out rx_part))
                        ErrorExit(ErrorPos, "Macro symbol {0} used in inflectional pair {1} {2} was not defined.", macro, pair.Key, pair.Value);

                    if (!s_out.Contains(macro_char))
                        ErrorExit(ErrorPos, "Inflectional pair {0} {1} uses macro symbol {2} in the input but not the output.", pair.Key, pair.Value, macro);

                    // Generate a new placeholder for the first occurrence of this macro in what will be the input RegEx
                    s_inp = s_inp.Remove(ix_bang, 2).Insert(ix_bang, ch_ins.ToString());
                    inserts.Add(new KeyValuePair<Char, String>(ch_ins++, rx_part));

                    // Check for additional uses of this macro, still in the *input* RegEx. These all share one
                    // placeholder, which becomes a back-reference to the capture of the first occurrence.
                    ix_additional_use = s_inp.IndexOf(macro);
                    if (ix_additional_use != -1)
                    {
                        do
                            s_inp = s_inp.Remove(ix_additional_use, 2).Insert(ix_additional_use, ch_ins.ToString());
                        while ((ix_additional_use = s_inp.IndexOf(macro, ix_additional_use)) != -1);

                        inserts.Add(new KeyValuePair<Char, String>(ch_ins++, @"\" + i_capture.ToString()));
                    }

                    // Reference the corresponding capture in the *replacement* part of the RegEx
                    s_out = s_out.Replace(macro, "$" + i_capture.ToString());

                    // Keep track of the capture indexes that the RegEx will generate
                    i_capture++;
                }

                // 3. Now that macros are out of the way, we can escape the non-macro parts for RegEx
                s_inp = Regex.Escape(s_inp);

                // 4. Now we can replace each placeholder with its corresponding string
                foreach (var kvp in inserts)
                    s_inp = s_inp.Replace(kvp.Key.ToString(), kvp.Value);

                // 5. Complete the RegEx expression by including the non-affixed body
                if (f_pfx)
                {
                    s_inp = "^" + s_inp + "(.*)$";
                    s_out = s_out + "$" + i_capture.ToString();
                }
                else
                {
                    s_inp = "^(.*)" + s_inp + "$";
                    s_out = "$1" + s_out;
                }

                // 6. Create a RegEx and add it to the Tdl token
                tokm.subrules.Add(new MorphologySubrule
                {
                    regex = new Regex(s_inp, RegexOptions.Compiled | RegexOptions.Singleline),
                    replace = s_out
                });
            }
            return tokm;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Called with the scan position on the first character of a line, which must exist. If that character is
        /// '%', the rest of the line is consumed as a morphology specification: a letter-set definition is recorded
        /// and a prefix or suffix rule is returned as a token. Returns null in every other case.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        TdlTok LineBegin()
        {
            if (s_cont[i] == '%')
            {
                int i_start = ++i;
                while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
                    i++;

                String s_subrule = s_cont.Substring(i_start, i - i_start).Trim();
                if (s_subrule.Length == 0)
                    throw new TdlException(ErrorPos, "Expected a morphology specification after '%'");

                // Strip one pair of enclosing parentheses
                if (s_subrule[0] == '(' && s_subrule[s_subrule.Length - 1] == ')')
                    s_subrule = s_subrule.Substring(1, s_subrule.Length - 2);

                String[] rgs = s_subrule
                                    .Replace(@"\(", "(")
                                    .Replace(@"\)", ")")
                                    .Replace(@"\!", "!")
                                    .Replace(@"\?", "?")
                                    .Split(one_space, StringSplitOptions.RemoveEmptyEntries);

                if (rgs.Length < 2)
                    ErrorExit(ErrorPos, "Syntax error in inflectional rule: '{0}'.", s_subrule);

                switch (rgs[0].ToLower())
                {
                    case "letter-set":
                        if (rgs.Length != 3)
                            ErrorExit(ErrorPos, "Invalid inflection letter set definition: {0}.", s_subrule);
                        ProcessLetterSet(rgs[1], rgs[2]);
                        break;

                    case "prefix":
                        if (((rgs.Length - 1) & 1) > 0)
                            TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
                        return ProcessAffix(s_subrule, true, i_start, rgs.Skip(1));

                    case "suffix":
                        if (((rgs.Length - 1) & 1) > 0)
                            TdlTokenizer.ErrorExit(ErrorPos, "Invalid inflectional input/output pairing.");
                        return ProcessAffix(s_subrule, false, i_start, rgs.Skip(1));

                    default:
                        ErrorExit(ErrorPos, "Unrecognized inflectional rule type: '{0}'.", rgs[0].ToLower());
                        break;
                }
            }
            return null;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Called after a line terminator character has been consumed, with that character still in 'ch'. Skips the
        /// line-feed of a cr-lf pair, starts a new line for position reporting, and processes the beginning of the
        /// next line if there is one.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        TdlTok LineEnd()
        {
            if (i < s_cont.Length && ch == 13 && s_cont[i] == 10)    // windows cr-lf
                i++;
            col_start = i;
            line++;
            return i < s_cont.Length ? LineBegin() : null;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Tokenizes the file content from the beginning. Morphology tokens from lines which start with '%' are
        /// interleaved with the ordinary tokens; note that this also applies to such lines inside a string constant
        /// or block comment. Identifier and tag text is lower-cased.
        /// </summary>
        /// <exception cref="TdlException">A syntax error was found (raised via ErrorExit).</exception>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public IEnumerator<TdlTok> GetEnumerator()
        {
            int sq_nest = 0;
            int ang_nest = 0;
            int dl_nest = 0;

            // Reset the complete scan position, so that a repeated enumeration reports correct lines and columns
            i = 0;
            line = 1;
            col_start = 0;

            // LineBegin examines s_cont[i], so it must not be called for an empty file
            TdlTok tokm = i < s_cont.Length ? LineBegin() : null;
            if (tokm != null)
                yield return tokm;

            while (i < s_cont.Length)
            {
                ch = s_cont[i];
                if (ch == 13 || ch == 10)
                {
                    i++;
                    if ((tokm = LineEnd()) != null)
                        yield return tokm;
                }
                else if (ch == '\"' || ch == '\'')    // double-quoted or lisp-style 'string
                {
                    // A lisp-style string extends to the next space
                    Char ch_term = ch == '\"' ? ch : ' ';
                    i++;
                    int i_start = i;

                    // Report the token at the position where the string starts
                    int tok_line = line;
                    int tok_col = i_start - col_start;

                    // Testing 'i' against the content length afterwards cannot distinguish a missing terminator
                    // from a terminator which is the last character of the file, so record it explicitly.
                    bool f_terminated = false;
                    while (i < s_cont.Length)
                    {
                        ch = s_cont[i++];
                        if (ch == ch_term)
                        {
                            f_terminated = true;
                            break;
                        }
                        if (ch == 13 || ch == 10)
                            if ((tokm = LineEnd()) != null)
                                yield return tokm;
                    }
                    if (!f_terminated)
                        ErrorExit(ErrorPos, "End-of-file reading string constant.");
                    yield return new TdlTok(tok_line, tok_col, s_file, TdlTok.Type.String, s_cont.Substring(i_start, i - 1 - i_start));
                }
                else if (ch == ' ')        // ignore whitespace
                {
                    i++;
                }
                else if (ch == '\t')    // ignore tab
                {
                    i++;
                    col_start -= tab_setting;
                }
                else if (ch == ';')        // comment extends to end of line
                {
                    while (i < s_cont.Length && (ch = s_cont[i]) != 13 && ch != 10)
                        i++;
                    i++;
                    if ((tokm = LineEnd()) != null)
                        yield return tokm;
                }
                else if (ch == ':')
                {
                    while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                        ;
                    if (ch == '=' || ch == '<')
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Define);
                    else if (ch == '+')
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Append);
                    else
                        ErrorExit(NextErrorPos, "Error: ':" + ch + "'; expected :=, :<, or :+");
                    i++;
                }
                else if (ch == '[')
                {
                    i++;
                    sq_nest++;
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareOpen);
                }
                else if (ch == ']')
                {
                    i++;
                    if (sq_nest == 0)
                        ErrorExit(ErrorPos, "Unmatched closing square-bracket ']'.");
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.SquareClose);
                    sq_nest--;
                }
                else if (ch == '<')
                {
                    while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                        ;
                    if (ch == '!')
                    {
                        i++;
                        dl_nest++;
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListOpen);
                    }
                    else
                    {
                        ang_nest++;
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleOpen);
                    }
                }
                else if (ch == '>')
                {
                    i++;
                    if (ang_nest == 0)
                        ErrorExit(ErrorPos, "Unmatched closing angle-bracket '>'.");
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.AngleClose);
                    ang_nest--;
                }
                else if (ch == '!')
                {
                    while (++i < s_cont.Length && (ch = s_cont[i]) == ' ')
                        ;
                    if (ch != '>')
                        ErrorExit(NextErrorPos, "Error: '!" + ch + "'");
                    if (dl_nest == 0)
                        ErrorExit(ErrorPos, "Unmatched difference list closing symbol '!>'.");
                    i++;
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.DifferenceListClose);
                    dl_nest--;
                }
                else if (ch == '&')
                {
                    i++;
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ampersand);
                }
                else if (ch == ',')
                {
                    i++;
                    yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Comma);
                }
                else if (ch == '.')
                {
                    i++;
                    if (i + 1 < s_cont.Length && s_cont[i] == '.' && s_cont[i + 1] == '.')
                    {
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Ellipsis);
                        i += 2;
                    }
                    else
                        yield return new TdlTok(line, i - col_start, s_file, TdlTok.Type.Dot);
                }
                else if (ch == '#' && i + 1 < s_cont.Length && s_cont[i + 1] == '|')    // block comment
                {
                    i += 2;
                    while (i < s_cont.Length - 1 && ((ch = s_cont[i]) != '|' || s_cont[i + 1] != '#'))
                    {
                        i++;
                        if (ch == 13 || ch == 10)
                            if ((tokm = LineEnd()) != null)
                                yield return tokm;
                    }
                    // Step over the terminator; if none was found this moves past the end of the content
                    i += 2;
                    if (i > s_cont.Length)
                        ErrorExit(ErrorPos, "End-of-file in block comment.");
                }
                else if (ch == '#')
                {
                    i++;
                    int i_start = i;
                    // The first character after '#' has to be tested too; pre-incrementing here would accept a
                    // stop character as a tag name and run past the end of the content for a trailing '#'.
                    while (i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
                        i++;
                    if (i_start == i)
                        ErrorExit(NextErrorPos, "Unexpected character: " + ch);
                    yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Tag, s_cont.Substring(i_start, i - i_start).ToLower());
                }
                else                    // identifier
                {
                    // The current character always becomes the first character of the identifier
                    int i_start = i;
                    while (++i < s_cont.Length && !ident_stop.Contains(ch = s_cont[i]))
                        ;
                    // NOTE(review): this cannot fire, since the pre-increment above always advances 'i'. Of the stop
                    // characters only '=' reaches this branch, and it is accepted as the start of an identifier.
                    if (i_start == i)
                        ErrorExit(NextErrorPos, "Unexpected character: " + ch);
                    yield return new TdlTok(line, i_start - col_start, s_file, TdlTok.Type.Identifier, s_cont.Substring(i_start, i - i_start).ToLower());
                }
            }
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Throws a TdlException for position <paramref name="t"/>. The message names the file, line and column, and
        /// also the token when <paramref name="t"/> is a TdlTok. This method does not return.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public static void ErrorExit(ErrorPos t, String msg)
        {
            TdlTok tk = t as TdlTok;
            String s_msg;
            if (tk != null)
                s_msg = String.Format("Error in {0}, line {1}, col {2}: {3} : {4}", t.file, t.line, t.col, tk, msg);
            else
                s_msg = String.Format("Error in {0}, line {1}, col {2}: {3}", t.file, t.line, t.col, msg);
            throw new TdlException(t, s_msg);
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Formats <paramref name="fmt"/> with <paramref name="args"/> and throws as the two-argument overload does.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public static void ErrorExit(ErrorPos t, String fmt, params Object[] args)
        {
            ErrorExit(t, String.Format(fmt, args));
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Throws a TdlException which has no position information.
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public static void ErrorExit(String msg)
        {
            throw new TdlException("Error: {0}", msg);
        }

        /// <summary>
        /// Non-generic enumeration, forwarded to the generic enumerator.
        /// </summary>
        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}