using System;
using System.Diagnostics;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using miew.String;
using miew.Enumerable;

namespace agree
{
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/// <summary>
	/// Error position information in a TDL file
	/// </summary>
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public class ErrorPos
	{
		public ErrorPos(int l, int c, String f)
		{
			line = l;
			col = c;
			file = f;
		}
		public int line;
		public int col;
		public String file;
	}

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/// <summary>
	/// TDL token
	/// </summary>
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public class TdlTok : ErrorPos
	{
		static public byte[] TokMap_Empty;
		static public byte[] TokMap_Comma;
		static public byte[] TokMap_SqCl;
		static public byte[] TokMap_AngCl;
		static public byte[] TokMap_AngCl_Comma;
		static public byte[] TokMap_AngCl_Comma_Dot;
		static public byte[] TokMap_DlsCl_Comma;

		static int c_enum;

		static TdlTok()
		{
			c_enum = Enum.GetValues(typeof(TdlTok.Type)).Cast<int>().Max() + 1;
			TokMap_Empty = TdlTok.CreateTokenMap();
			TokMap_SqCl = TdlTok.CreateTokenMap(TdlTok.Type.SquareClose);
			TokMap_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma);
			TokMap_AngCl = TdlTok.CreateTokenMap(TdlTok.Type.AngleClose);
			TokMap_AngCl_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.AngleClose);
			TokMap_AngCl_Comma_Dot = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.AngleClose, TdlTok.Type.Dot);
			TokMap_DlsCl_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.DifferenceListClose);
		}

		static public byte[] CreateTokenMap(params Type[] args)
		{
			byte[] tok_map = Enumerable.Repeat<byte>(0, c_enum).ToArray();
			foreach (Type t in args)
				tok_map[(int)t] = 1;
			return tok_map;
		}

		public enum Type
		{
			Identifier = 1,
			String,
			Tag,					// #...

			// single character tokens
			Ampersand,				// &
			Dot,					// .
			Comma,					// ,
			SquareOpen,				// [
			SquareClose,			// ]
			AngleOpen,				// <
			AngleClose,				// >

			// atomic 2-character tokens (can contain whitespace because of matrix.tdl)
			Append,					// :+
			Define,					// := : = :<
			DifferenceListOpen,		// <! < !
			DifferenceListClose,	// !> ! >
			Ellipsis,				// ...

			// morphology specification also uses inherited type
			Morphology,
		};

		public TdlTok(int l, int c, String f, Type t)
			: base(l, c, f)
		{
			this.t = t;
		}

		public TdlTok(int l, int c, String f, Type t, String s)
			: base(l, c, f)
		{
			this.t = t;
			this.i_s = String.Intern(s);
		}

		readonly public Type t;
		public String i_s;
		public int c_parts;

		public String _Debug()
		{
			return String.Format("{0} {1} {2} {3} {4}", file, line, col, t.ToString(), i_s == null ? String.Empty : i_s);
		}

		public override String ToString()
		{
			String s = t.Render();
			if (s != null)
				return s;
			if (this is TdlTokPrefix)
				return " %prefix " + i_s + " ";
			if (this is TdlTokSuffix)
				return " %suffix " + i_s + " ";
			switch (t)
			{
				case TdlTok.Type.Identifier:
					return " " + i_s + " ";
				case TdlTok.Type.String:
					return " \"" + i_s + "\" ";
				case TdlTok.Type.Tag:
					return " #" + i_s + " ";
				default:
					throw new Exception();
			}
		}
	};
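
	// Illustrative sketch (hypothetical helper, not part of the original source): the TokMap_* arrays
	// above are indexed by (int)TdlTok.Type, and CreateTokenMap sets the entry to 1 for each type it is
	// given, so testing whether a token belongs to one of these sets is assumed to be a single array lookup.
	static class TdlTokMapSketch
	{
		// True if the token's type was among those passed to TdlTok.CreateTokenMap when 'map' was built.
		public static bool Matches(byte[] map, TdlTok tok)
		{
			return map[(int)tok.t] != 0;
		}
	}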

	[Serializable]
	[DebuggerDisplay("{ToString(),nq}")]
	public struct MorphologySubrule
	{
		public Regex regex;
		public String replace;

		public override string ToString()
		{
			return String.Format("{0} {1}", regex, replace);
		}
	};

	abstract class TdlTokMorphology : TdlTok
	{
		public TdlTokMorphology(int l, int c, String f)
			: base(l, c, f, Type.Morphology)
		{
		}
		public List<MorphologySubrule> subrules = new List<MorphologySubrule>();
	};

	class TdlTokPrefix : TdlTokMorphology
	{
		public TdlTokPrefix(int l, int c, String f)
			: base(l, c, f)
		{
		}
	};

	class TdlTokSuffix : TdlTokMorphology
	{
		public TdlTokSuffix(int l, int c, String f)
			: base(l, c, f)
		{
		}
	};

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///
	/// TDL token extension function(s)
	///
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public static class TdlTokExt
	{
		public static String Render(this TdlTok.Type t)
		{
			switch (t)
			{
				case TdlTok.Type.Ampersand: return "&";
				case TdlTok.Type.Dot: return ".";
				case TdlTok.Type.Comma: return ",";
				case TdlTok.Type.SquareOpen: return "[";
				case TdlTok.Type.SquareClose: return "]";
				case TdlTok.Type.AngleOpen: return "<";
				case TdlTok.Type.AngleClose: return ">";
				case TdlTok.Type.Append: return ":+";
				case TdlTok.Type.Define: return ":=";	// also, :<
				case TdlTok.Type.DifferenceListOpen: return "<!";
				case TdlTok.Type.DifferenceListClose: return "!>";
				case TdlTok.Type.Ellipsis: return "...";
				default: return null;
			}
		}
	};

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///
	/// Base Feature or Base Constraint
	///
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	[DebuggerDisplay("{ToString(),nq}")]
	public class BaseFeatConstraint : List<TdlTok>
	{
		public override String ToString()
		{
			return this.StringJoin(" ");
		}
	};

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///
	/// Token Definition Group
	///
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public class TokenDefinitionGroup
	{
		public enum Type
		{
			Tdl,
			GrammarRule,
			LexicalRule,
			Lexicon,
			Label,
			Start,
		};

		public TokenDefinitionGroup(Type type, TdlTok _t)
		{
			this.type = type;
			this.tok_ident = _t;
		}

		/*readonly*/ public Type type;
		readonly public TdlTok tok_ident;						// identifier
		public bool f_append;									// false for :=  true for :+
		public List<TdlTok> conj_par = new List<TdlTok>();		// conjoined parents
		public List<BaseFeatConstraint> m_bfc;					// contents of feature structure(s)
		public String top_comment;								// conjoined top-level comment
		public List<MorphologySubrule> morph_subrules;			// morphology subrules

		public String Render()
		{
			String ret = String.Format("Type:\t{0}\n", tok_ident.i_s);
			if (conj_par != null && conj_par.Count > 0)
				ret += String.Format("Parents:\t{0}\n", conj_par.Select(e => e.i_s).StringJoin(" "));
			if (m_bfc != null)
			{
				ret += "Constraint:\n\t";
				ret += m_bfc.Select(bfc => bfc.ToString()).StringJoin("\n\t") + "\n";	// render each base constraint, one per line
			}
			if (top_comment != null)
				ret += String.Format("Comment:\t{0}\n", top_comment);
			return ret;
		}

		public String Identifier { get { return tok_ident.i_s; } }
	};
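
	// Illustrative sketch (the type names below are made up): for a TDL definition such as
	//
	//     head-noun := noun & "sample docstring" & [ SYNSEM.LOCAL.CAT.HEAD noun ].
	//
	// the grouping pass below produces one TokenDefinitionGroup whose Identifier is "head-noun", whose
	// conj_par holds the parent token 'noun', whose top_comment is "sample docstring", and whose m_bfc
	// holds the tokenized contents of the [ ... ] constraint; f_append is true only when ':+' is used
	// in place of ':='.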

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/// <summary>
	/// Helper class for assembling a stream of TDL tokens into token definition groups
	/// </summary>
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	class TokenGrouper : IEnumerable<TokenDefinitionGroup>
	{
		readonly static Char[] one_space = new Char[] { ' ' };

		public TokenGrouper(TokenDefinitionGroup.Type type, IEnumerable<TdlTok> ie)
		{
			this.type = type;
			this.ie = ie;
		}

		readonly TokenDefinitionGroup.Type type;
		readonly IEnumerable<TdlTok> ie;
		IEnumerator<TdlTok> iett;
		TdlTok prv;
		TdlTok cur;
		bool f_inhibit_next_advance;
		Stack<TdlTok> nest;

		bool Advance()
		{
			if (f_inhibit_next_advance)
				f_inhibit_next_advance = false;
			else if (!iett.MoveNext())
				return false;
			prv = cur;
			cur = iett.Current;
			return true;
		}

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// Groups tokens from a token stream into zero or more token definition groups.
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public IEnumerator<TokenDefinitionGroup> GetEnumerator()
		{
			prv = cur = default(TdlTok);
			f_inhibit_next_advance = default(bool);
			iett = ie.GetEnumerator();
			nest = new Stack<TdlTok>();
			while (Advance())
			{
				TokenDefinitionGroup tdg = new TokenDefinitionGroup(type, cur);
				if (tdg.tok_ident.t != TdlTok.Type.Identifier)
					TdlTokenizer.ErrorExit(tdg.tok_ident, "Expected an identifier.");

				if (!Advance())
					TdlTokenizer.ErrorExit(tdg.tok_ident, "Unexpected end of file.");
				if (cur.t != TdlTok.Type.Define && cur.t != TdlTok.Type.Append)
					TdlTokenizer.ErrorExit(cur, "Expected := or :+");
				tdg.f_append = (cur.t == TdlTok.Type.Append);

				if (!Advance())
					TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");

				while (cur.t == TdlTok.Type.Morphology)
				{
					// Although supposedly forbidden by the LKB docs (Copestake 2002, p.131) some grammars get away with
					// using morphology specifications in LexicalRule files.
					if (type != TokenDefinitionGroup.Type.LexicalRule)
						TdlTokenizer.ErrorExit(cur, "Cannot specify inflection rules in this file.");
					tdg.morph_subrules = tdg.morph_subrules ?? new List<MorphologySubrule>();
					tdg.morph_subrules.AddRange((cur as TdlTokMorphology).subrules);
					if (!Advance())
						TdlTokenizer.ErrorExit(cur, "Unexpected end of file after '%prefix' or '%suffix' in inflectional rule definition.");
				}

				// body of the definition
				nest.Clear();
				while (true)
				{
					if (cur.t == TdlTok.Type.SquareOpen)
					{
						tdg.m_bfc = tdg.m_bfc ?? new List<BaseFeatConstraint>();
						BaseFeatConstraint bfc_cur = new BaseFeatConstraint();
						nest.Push(cur);
						while (true)
						{
							if (!Advance())
								TdlTokenizer.ErrorExit(cur, "End of file searching for ']' to terminate feature structure. Unmatched '['.");
							if (cur.t == TdlTok.Type.Comma)
							{
								// count number of comma-separated items in each [ ... ] group
								nest.Peek().c_parts++;		// could be '<', '<!', '['
								if (nest.Count == 1)		// must be '['
								{
									tdg.m_bfc.Add(bfc_cur);
									bfc_cur = new BaseFeatConstraint();
									continue;
								}
							}
							else if (cur.t == TdlTok.Type.Dot)
							{
								TdlTok pk = nest.Peek();
								if (pk.t == TdlTok.Type.AngleOpen)
								{
									if (pk.c_parts > 0)
										TdlTokenizer.ErrorExit(cur, "Dotted pair notation < a . b > cannot have more than two parts.");
									pk.c_parts++;
								}
							}
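							// Track list/AVM nesting: each opening token ('<', '<!', '[') is pushed so that its
							// matching closing token can be checked and so that commas are counted per level (c_parts).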
							else if (cur.t == TdlTok.Type.AngleOpen || cur.t == TdlTok.Type.DifferenceListOpen || cur.t == TdlTok.Type.SquareOpen)
								nest.Push(cur);
							else if (cur.t == TdlTok.Type.AngleClose || cur.t == TdlTok.Type.DifferenceListClose || cur.t == TdlTok.Type.SquareClose)
							{
								if (nest.Count == 0)
									TdlTokenizer.ErrorExit(cur, "List not open; '{0}' is invalid.", cur);

								// the pending (open) grouping token
								TdlTok actual_open = nest.Pop();

								// hacky way to get the expected closing token corresponding to the actual opening token
								TdlTok.Type expected_close = (TdlTok.Type)(actual_open.t + 1);

								// check that the opening token for the encountered closing token matches the opening token for the
								// pending grouping type
								if (cur.t != expected_close)
									TdlTokenizer.ErrorExit(cur, "Invalid nesting '{0}', expected {1}.", cur, expected_close.ToString());

								// disallow empty parts
								if (prv.t == TdlTok.Type.Comma)
									TdlTokenizer.ErrorExit(cur, "Invalid ',', list {0} {1} cannot have empty parts.", actual_open, cur);

								// We counted zero commas in < >, which has zero parts, and we
								// counted (e.g.) n commas in < a , b > which has n+1 parts.
								// Furthermore, < a, b, ... > will be treated as having only two parts.
								if (prv.t != actual_open.t && (actual_open.t != TdlTok.Type.AngleOpen || prv.t != TdlTok.Type.Ellipsis))
									actual_open.c_parts++;

								if (nest.Count == 0 && cur.t == TdlTok.Type.SquareClose)
								{
									tdg.m_bfc.Add(bfc_cur);
									break;
								}
							}
							bfc_cur.Add(cur);
						}
					}
					else if (cur.t == TdlTok.Type.String)
					{
						tdg.top_comment = cur.i_s;
					}
					else if (cur.t == TdlTok.Type.Identifier)
					{
						tdg.conj_par.Add(cur);
					}
					else if (cur.t == TdlTok.Type.Tag)
					{
						TdlTokenizer.ErrorExit(cur, String.Format("Cannot conjoin a co-reference tag (#{0}) at the top level of a TDL definition.", cur.i_s));
					}
					else if (cur.t == TdlTok.Type.Morphology)
					{
						TdlTokenizer.ErrorExit(cur, "Inflectional prefix or suffix must occur after ':=' and before any parent types.");
					}
					else
						TdlTokenizer.ErrorExit(cur, String.Format("Unexpected token '{0}'.", cur));

					if (!Advance())
						TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");

					if (cur.t == TdlTok.Type.Dot)		// all TDL definitions must be terminated with a period
					{
						yield return tdg;
						break;
					}
					else if (cur.t == TdlTok.Type.SquareOpen)
						f_inhibit_next_advance = true;	// LKB BUG -- allows feature structure without conjunction
					else if (cur.t != TdlTok.Type.Ampersand)
						TdlTokenizer.ErrorExit(cur, String.Format("'{0}' is not valid here. Expected '&' or '.'", cur));

					if (!Advance())
						TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");
				}
			}
		}

		System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
		{
			throw new NotImplementedException();
		}
	};
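
	// Illustrative usage sketch (hypothetical helper class, not part of the original source): given the
	// token stream produced for one TDL file, TokenGrouper yields one TokenDefinitionGroup per
	// '.'-terminated definition, which can then be inspected or rendered.
	static class TokenGrouperUsageSketch
	{
		public static void DumpDefinitions(IEnumerable<TdlTok> toks)
		{
			foreach (TokenDefinitionGroup tdg in new TokenGrouper(TokenDefinitionGroup.Type.Tdl, toks))
				Console.WriteLine(tdg.Render());
		}
	}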

	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	//
	//
	//
	////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	public struct TdlParsePos : IEnumerator<TdlTok>
	{
		public TdlParsePos(BaseFeatConstraint bfc)
		{
			this.bfc = bfc;
			this.i = -1;
		}

		readonly public BaseFeatConstraint bfc;

		// todo: ref pp, private i
		public int i;

		public bool Eof { get { return i >= bfc.Count; } }
		public TdlTok Current { get { return bfc[i]; } }
		public TdlTok.Type CurType { get { return bfc[i].t; } }
		public String CurString { get { return bfc[i].i_s; } }

		public bool MoveNext()
		{
			if (i < bfc.Count)
				i++;
			return i < bfc.Count;
		}

		public TdlTok MoveNextThrow(String error_msg = null)
		{
			if (error_msg == null)
				error_msg = "Incomplete TDL constraint specification.";
			if (i + 1 < bfc.Count)
				return bfc[++i];
			else
				throw new TdlException(bfc[bfc.Count - 1], error_msg);
		}

		public void VerifyTokenType(TdlTok.Type tt, String msg = null)
		{
			if (msg == null)
				msg = String.Format("Expected '{0}'.", tt.Render() ?? "??");
			if (CurType != tt)
				throw new TdlException(Current, msg);
		}

		public bool SkipToType(TdlTok.Type t)
		{
			while (true)
			{
				if (i >= bfc.Count)
					return false;
				if (bfc[i].t == t)
					return true;
				i++;
			}
		}

		public override String ToString()
		{
			if (bfc == null)
				return "(m_bfc == null)";
			return bfc.Skip(i).StringJoin(" ");
		}

		public void Dispose()
		{
		}

		public void Reset()
		{
			i = -1;
		}

		object System.Collections.IEnumerator.Current { get { return Current; } }
	};
}
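
// Illustrative sketch: TdlParsePos is assumed to be consumed by downstream constraint-parsing code to
// walk the tokens of a single BaseFeatConstraint, along the lines of (hypothetical):
//
//     TdlParsePos pp = new TdlParsePos(bfc);
//     pp.MoveNextThrow();
//     pp.VerifyTokenType(TdlTok.Type.Identifier);
//     String feature_name = pp.CurString;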