using System;
using System.Diagnostics;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using miew.String;
using miew.Enumerable;
namespace agree
{
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Error position information in a TDL file
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class ErrorPos
{
public ErrorPos(int l, int c, String f)
{
line = l;
col = c;
file = f;
}
public int line;
public int col;
public String file;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// TDL token
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class TdlTok : ErrorPos
{
static public byte[] TokMap_Empty;
static public byte[] TokMap_Comma;
static public byte[] TokMap_SqCl;
static public byte[] TokMap_AngCl;
static public byte[] TokMap_AngCl_Comma;
static public byte[] TokMap_AngCl_Comma_Dot;
static public byte[] TokMap_DlsCl_Comma;
static int c_enum;
static TdlTok()
{
c_enum = Enum.GetValues(typeof(TdlTok.Type)).Cast<int>().Max() + 1;
TokMap_Empty = TdlTok.CreateTokenMap();
TokMap_SqCl = TdlTok.CreateTokenMap(TdlTok.Type.SquareClose);
TokMap_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma);
TokMap_AngCl = TdlTok.CreateTokenMap(TdlTok.Type.AngleClose);
TokMap_AngCl_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.AngleClose);
TokMap_AngCl_Comma_Dot = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.AngleClose, TdlTok.Type.Dot);
TokMap_DlsCl_Comma = TdlTok.CreateTokenMap(TdlTok.Type.Comma, TdlTok.Type.DifferenceListClose);
}
static public byte[] CreateTokenMap(params Type[] args)
{
byte[] tok_map = new byte[c_enum]; // zero-initialized; entries set to 1 below mark member token types
foreach (Type t in args)
tok_map[(int)t] = 1;
return tok_map;
}
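// A token map is a lookup table indexed by (int)TdlTok.Type, with a nonzero byte marking the
// token type as a member of the set. A typical membership test might look like the following
// (illustrative sketch only; the actual call sites are elsewhere in the tokenizer/parser):
//
//		bool is_stop_token = TdlTok.TokMap_AngCl_Comma[(int)tok.t] != 0;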
public enum Type
{
Identifier = 1,
String,
Tag, // #...
// single character tokens
Ampersand, // &
Dot, // .
Comma, // ,
SquareOpen, // [
SquareClose, // ]
AngleOpen, // <
AngleClose, // >
// atomic multi-character tokens (may contain internal whitespace because of matrix.tdl)
Append, // :+
Define, // := (also written ': =' or ':<')
DifferenceListOpen, // <! (also '< !')
DifferenceListClose, // !> (also '! >')
Ellipsis, // ...
// morphology specification also uses inherited type
Morphology,
};
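// Illustrative example (not taken from the sources): the TDL definition
//
//		noun := word & [ ORTH "dog" ].
//
// is expected to arrive from the tokenizer roughly as the sequence
//		Identifier("noun") Define Identifier("word") Ampersand SquareOpen
//		Identifier("ORTH") String("dog") SquareClose Dot
// where ':=' (or ': =' or ':<') yields a single Define token.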
public TdlTok(int l, int c, String f, Type t)
: base(l, c, f)
{
this.t = t;
}
public TdlTok(int l, int c, String f, Type t, String s)
: base(l, c, f)
{
this.t = t;
this.i_s = String.Intern(s);
}
readonly public Type t;
public String i_s;
public int c_parts;
public String _Debug()
{
return String.Format("{0} {1} {2} {3} {4}", file, line, col, t.ToString(), i_s == null ? String.Empty : i_s);
}
public override String ToString()
{
String s = t.Render();
if (s != null)
return s;
if (this is TdlTokPrefix)
return " %prefix " + i_s + " ";
if (this is TdlTokSuffix)
return " %suffix " + i_s + " ";
switch (t)
{
case TdlTok.Type.Identifier:
return " " + i_s + " ";
case TdlTok.Type.String:
return " \"" + i_s + "\" ";
case TdlTok.Type.Tag:
return " #" + i_s + " ";
default:
throw new Exception();
}
}
};
[Serializable]
[DebuggerDisplay("{ToString(),nq}")]
public struct MorphologySubrule
{
public Regex regex;
public String replace;
public override string ToString()
{
return String.Format("{0} {1}", regex, replace);
}
};
abstract class TdlTokMorphology : TdlTok
{
public TdlTokMorphology(int l, int c, String f)
: base(l, c, f, Type.Morphology)
{
}
public List<MorphologySubrule> subrules = new List<MorphologySubrule>();
};
class TdlTokPrefix : TdlTokMorphology
{
public TdlTokPrefix(int l, int c, String f) : base(l, c, f) { }
};
class TdlTokSuffix : TdlTokMorphology
{
public TdlTokSuffix(int l, int c, String f) : base(l, c, f) { }
};
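// A %prefix/%suffix line in a lexical rule file, e.g. (illustrative)
//
//		%suffix (!s !ss) (e es)
//
// is assumed to arrive as a single TdlTokPrefix/TdlTokSuffix token whose 'subrules' list holds
// one MorphologySubrule (match pattern, replacement text) per parenthesized pair.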
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///
/// TDL token extension function(s)
///
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
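// Render() returns the fixed surface text of punctuation-like token types, e.g. (illustrative)
// TdlTok.Type.Append.Render() == ":+", and null for token types whose text varies
// (Identifier, String, Tag, Morphology).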
public static class TdlTokExt
{
public static String Render(this TdlTok.Type t)
{
switch (t)
{
case TdlTok.Type.Ampersand:
return "&";
case TdlTok.Type.Dot:
return ".";
case TdlTok.Type.Comma:
return ",";
case TdlTok.Type.SquareOpen:
return "[";
case TdlTok.Type.SquareClose:
return "]";
case TdlTok.Type.AngleOpen:
return "<";
case TdlTok.Type.AngleClose:
return ">";
case TdlTok.Type.Append:
return ":+";
case TdlTok.Type.Define:
return ":="; // also, :<
case TdlTok.Type.DifferenceListOpen:
return "<!";
case TdlTok.Type.DifferenceListClose:
return "!>";
case TdlTok.Type.Ellipsis:
return "...";
default:
return null;
}
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///
/// Base Feature or Base Constraint
///
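/// A BaseFeatConstraint holds the flat token sequence of one comma-separated constraint taken
/// from inside a top-level [ ... ] feature structure. For example (illustrative),
///		[ ORTH < "dog" >, INFLECTD + ]
/// yields two constraints: the tokens of 'ORTH < "dog" >' and the tokens of 'INFLECTD +'.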
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{ToString(),nq}")]
public class BaseFeatConstraint : List<TdlTok>
{
public override String ToString()
{
return this.StringJoin(" ");
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///
/// Token Definition Group
///
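/// One group corresponds to one TDL definition. For example (illustrative), the definition
///		noun := word & sign & "a noun" & [ ORTH "dog" ].
/// would be grouped with tok_ident = 'noun', f_append = false, conj_par = { 'word', 'sign' },
/// top_comment = "a noun", and m_bfc holding the constraint(s) from the [ ... ] block.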
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class TokenDefinitionGroup
{
public enum Type
{
Tdl,
GrammarRule,
LexicalRule,
Lexicon,
Label,
Start,
};
public TokenDefinitionGroup(Type type, TdlTok _t)
{
this.type = type;
this.tok_ident = _t;
}
/*readonly*/
public Type type;
readonly public TdlTok tok_ident; // identifier
public bool f_append; // false for := true for :+
public List<TdlTok> conj_par = new List<TdlTok>(); // conjoined parents
public List<BaseFeatConstraint> m_bfc; // contents of feature structure(s)
public String top_comment; // conjoined top-level comment
public List<MorphologySubrule> morph_subrules; // morphology subrules
public String Render()
{
String ret = String.Format("Type:\t{0}\n", tok_ident.i_s);
if (conj_par != null && conj_par.Count > 0)
ret += String.Format("Parents:\t{0}\n", conj_par.Select(e => e.i_s).StringJoin(" "));
if (m_bfc != null)
{
ret += "Constraint:\n\t";
ret += m_bfc.StringJoin("\n\t") + "\n"; // render each constraint, not the List's type name
}
if (top_comment != null)
ret += String.Format("Comment:\t{0}\n", top_comment);
return ret;
}
public String Identifier { get { return tok_ident.i_s; } }
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Helper class for assembling a stream of TDL tokens into token definition groups
/// </summary>
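/// <remarks>
/// Typical usage might look like the following (illustrative sketch; the real call sites are
/// elsewhere in the loader, and 'ProcessDefinition' is a hypothetical consumer):
///		foreach (TokenDefinitionGroup tdg in new TokenGrouper(TokenDefinitionGroup.Type.Tdl, toks))
///			ProcessDefinition(tdg);
/// </remarks>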
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class TokenGrouper : IEnumerable<TokenDefinitionGroup>
{
readonly static Char[] one_space = new Char[] { ' ' };
public TokenGrouper(
TokenDefinitionGroup.Type type,
IEnumerable<TdlTok> ie)
{
this.type = type;
this.ie = ie;
}
readonly TokenDefinitionGroup.Type type;
readonly IEnumerable<TdlTok> ie;
IEnumerator<TdlTok> iett;
TdlTok prv;
TdlTok cur;
bool f_inhibit_next_advance;
Stack<TdlTok> nest;
bool Advance()
{
if (f_inhibit_next_advance)
f_inhibit_next_advance = false;
else if (!iett.MoveNext())
return false;
prv = cur;
cur = iett.Current;
return true;
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Groups tokens from a token stream into zero or more token definition groups.
/// </summary>
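/// <remarks>
/// Each group is expected to have the overall shape enforced below (illustrative summary):
///		identifier (':=' | ':+') { %prefix/%suffix subrules }
///			( parent-identifier | "top comment" | [ feature structure ] ) joined by '&', terminated by '.'
/// </remarks>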
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public IEnumerator<TokenDefinitionGroup> GetEnumerator()
{
prv = cur = default(TdlTok);
f_inhibit_next_advance = default(bool);
iett = ie.GetEnumerator();
nest = new Stack<TdlTok>();
while (Advance())
{
TokenDefinitionGroup tdg = new TokenDefinitionGroup(type, cur);
if (tdg.tok_ident.t != TdlTok.Type.Identifier)
TdlTokenizer.ErrorExit(tdg.tok_ident, "Expected an identifier.");
if (!Advance())
TdlTokenizer.ErrorExit(tdg.tok_ident, "Unexpected end of file.");
if (cur.t != TdlTok.Type.Define && cur.t != TdlTok.Type.Append)
TdlTokenizer.ErrorExit(cur, "Expected := or :+");
tdg.f_append = (cur.t == TdlTok.Type.Append);
if (!Advance())
TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");
while (cur.t == TdlTok.Type.Morphology)
{
// Although supposedly forbidden by the LKB docs (Copestake 2002, p. 131), some grammars get away
// with using morphology specifications in LexicalRule files.
if (type != TokenDefinitionGroup.Type.LexicalRule)
TdlTokenizer.ErrorExit(cur, "Cannot specify inflection rules in this file.");
tdg.morph_subrules = tdg.morph_subrules ?? new List<MorphologySubrule>();
tdg.morph_subrules.AddRange((cur as TdlTokMorphology).subrules);
if (!Advance())
TdlTokenizer.ErrorExit(cur, "Unexpected end of file after '%prefix' or '%suffix' in inflectional rule definition.");
}
// body of the definition
nest.Clear();
while (true)
{
if (cur.t == TdlTok.Type.SquareOpen)
{
tdg.m_bfc = tdg.m_bfc ?? new List<BaseFeatConstraint>();
BaseFeatConstraint bfc_cur = new BaseFeatConstraint();
nest.Push(cur);
while (true)
{
if (!Advance())
TdlTokenizer.ErrorExit(cur, "End of file searching for ']' to terminate feature structure. Unmatched '['.");
if (cur.t == TdlTok.Type.Comma)
{
// count number of comma-separated items in each [ ... ] group
nest.Peek().c_parts++; // could be '<', '<!', '['
if (nest.Count == 1) // must be '['
{
tdg.m_bfc.Add(bfc_cur);
bfc_cur = new BaseFeatConstraint();
continue;
}
}
else if (cur.t == TdlTok.Type.Dot)
{
TdlTok pk = nest.Peek();
if (pk.t == TdlTok.Type.AngleOpen)
{
if (pk.c_parts > 0)
TdlTokenizer.ErrorExit(cur, "Dotted pair notation < a . b > cannot have more than two parts.");
pk.c_parts++;
}
}
else if (cur.t == TdlTok.Type.AngleOpen || cur.t == TdlTok.Type.DifferenceListOpen || cur.t == TdlTok.Type.SquareOpen)
nest.Push(cur);
else if (cur.t == TdlTok.Type.AngleClose || cur.t == TdlTok.Type.DifferenceListClose || cur.t == TdlTok.Type.SquareClose)
{
if (nest.Count == 0)
TdlTokenizer.ErrorExit(cur, "List not open; '{0}' is invalid.", cur);
// the pending (open) grouping token
TdlTok actual_open = nest.Pop();
// hacky way to get the expected closing token corresponding to the actual opening token
TdlTok.Type expected_close = (TdlTok.Type)(actual_open.t + 1);
// check that the opening token for the encountered closing token matches the opening token for the
// pending grouping type
if (cur.t != expected_close)
TdlTokenizer.ErrorExit(cur, "Invalid nesting '{0}', expected {1}.", cur, expected_close.ToString());
// disallow empty parts
if (prv.t == TdlTok.Type.Comma)
TdlTokenizer.ErrorExit(cur, "Invalid ',', list {0} {1} cannot have empty parts.", actual_open, cur);
// We counted zero commas in < >, which has zero parts, and we
// counted (e.g.) n commas in < a , b > which has n+1 parts.
// Furthermore, < a, b, ... > will be treated as having only two parts.
if (prv.t != actual_open.t && (actual_open.t != TdlTok.Type.AngleOpen || prv.t != TdlTok.Type.Ellipsis))
actual_open.c_parts++;
if (nest.Count == 0 && cur.t == TdlTok.Type.SquareClose)
{
tdg.m_bfc.Add(bfc_cur);
break;
}
}
bfc_cur.Add(cur);
}
}
else if (cur.t == TdlTok.Type.String)
{
tdg.top_comment = cur.i_s;
}
else if (cur.t == TdlTok.Type.Identifier)
{
tdg.conj_par.Add(cur);
}
else if (cur.t == TdlTok.Type.Tag)
{
TdlTokenizer.ErrorExit(cur, String.Format("Cannot conjoin a co-reference tag (#{0}) at the top level of a TDL definition.", cur.i_s));
}
else if (cur.t == TdlTok.Type.Morphology)
{
TdlTokenizer.ErrorExit(cur, String.Format("Inflectional prefix or suffix must occur after ':=' and before any parent types.", cur.i_s));
}
else
TdlTokenizer.ErrorExit(cur, String.Format("Unexpected token '{0}'.", cur));
if (!Advance())
TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");
if (cur.t == TdlTok.Type.Dot) // all TDL definitions must be terminated with a period
{
yield return tdg;
break;
}
else if (cur.t == TdlTok.Type.SquareOpen)
f_inhibit_next_advance = true; // LKB BUG -- allows feature structure without conjunction
else if (cur.t != TdlTok.Type.Ampersand)
TdlTokenizer.ErrorExit(cur, String.Format("'{0}' is not valid here. Expected '&' or '.'", cur));
if (!Advance())
TdlTokenizer.ErrorExit(cur, "Unexpected end of file in type definition.");
}
}
}
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
{
// delegate to the strongly-typed enumerator rather than throwing
return GetEnumerator();
}
};
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Cursor over the token sequence of a single BaseFeatConstraint, with helpers for checked
// advancement and token-type verification during constraint parsing.
//
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
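// Typical use might look like the following (illustrative sketch; 'bfc' is some BaseFeatConstraint
// and the actual call sites are in the TDL parser):
//
//		TdlParsePos pp = new TdlParsePos(bfc);
//		while (pp.MoveNext())
//		{
//			if (pp.CurType == TdlTok.Type.Identifier)
//				Console.WriteLine(pp.CurString);
//		}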
public struct TdlParsePos : IEnumerator<TdlTok>
{
public TdlParsePos(BaseFeatConstraint bfc)
{
this.bfc = bfc;
this.i = -1;
}
readonly public BaseFeatConstraint bfc;
// todo: ref pp, private i
public int i;
public bool Eof { get { return i >= bfc.Count; } }
public TdlTok Current { get { return bfc[i]; } }
public TdlTok.Type CurType { get { return bfc[i].t; } }
public String CurString { get { return bfc[i].i_s; } }
public bool MoveNext()
{
if (i < bfc.Count)
i++;
return i < bfc.Count;
}
public TdlTok MoveNextThrow(String error_msg = null)
{
if (error_msg == null)
error_msg = "Incomplete TDL constraint specification.";
if (i + 1 < bfc.Count) // if already at the last token, fall through to the TdlException below
return bfc[++i];
else
throw new TdlException(bfc[bfc.Count - 1], error_msg);
}
public void VerifyTokenType(TdlTok.Type tt, String msg = null)
{
if (msg == null)
msg = String.Format("Expected '{0}'.", tt.Render() ?? "??");
if (CurType != tt)
throw new TdlException(Current, msg);
}
public bool SkipToType(TdlTok.Type t)
{
while (true)
{
if (i >= bfc.Count)
return false;
if (bfc[i].t == t)
return true;
i++;
}
}
public override String ToString()
{
if (bfc == null)
return "(m_bfc == null)";
return bfc.Skip(i).StringJoin(" ");
}
public void Dispose() { }
public void Reset() { i = -1; }
object System.Collections.IEnumerator.Current { get { return Current; } }
};
}