using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.IO;
using glue.Tokenization;
using glue.Debugging;
using glue.Extensions.Enumerable;
using glue.Collections.ReadOnly;
using glue.Collections.XSpinLock;

namespace agree
{
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	/// <summary>
	/// The 'count' of IParseChartToken items encapsulated in the LexicalAnalysis enumerator is *not* necessarily the
	/// required size of the parse chart, due to overlapping tokens or multiple tokens for a given chart position;
	/// use the ChartSize property for this.
	/// </summary>
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
	[DebuggerDisplay("{ToString(),nq}")]
	public partial class LexicalAnalysis : IEnumerable<ParseChart.IParseChartToken>, IDisposable
	{
		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		readonly TokenSet ts;
		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		readonly Lexicon lex;
		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		readonly Tray tray;
		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		readonly GrammarParser gp;

		readonly List<AnalysisStack> token_analyses = new List<AnalysisStack>();

		/// <summary>
		/// Because TfsEdges are referenced by multiple stacks within this LexicalAnalysis, we provide a method for their
		/// lifetime to be managed here. If a transform creates non-canonical TFSs, they are registered here to be
		/// cleaned up later. Canonical feature structures such as lexical rules should not be added.
		/// </summary>
		readonly List<TfsEdge> edges = new List<TfsEdge>();

		public Lexicon Lexicon { get { return lex; } }
		public TokenSet SourceItem { get { return ts; } }
		public GrammarParser Parser { get { return gp; } }

		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		public String SourceText { get { return ts.Source.Text; } }

		[DebuggerBrowsable(DebuggerBrowsableState.Never)]
		public int ChartSize { get { return ts.MinimalSpanCount; } }

		public int c_unif = 0;

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// Performs pre-parsing analysis of the tokens in a tokenization hypothesis, including lexical and irregular
		/// stem lookup, application of morphological and lexical rules, filtering, etc. The hypothesis may be either
		/// single-coverage (i.e. 'TokenizationHypothesis') or ad-hoc, where token selection is deferred to the
		/// parsing phase (i.e. 'TokenizedString').
		///
		/// Morpholexical analysis happens in two passes. The surface-to-stem inbound pass ("downtrace") permutes
		/// affixation--regular or irregular--fanning downwards until one or more stems are reached. Then, for each
		/// discovered downtrace, an outbound pass follows the downtrace in reverse, checking the license requirements
		/// of the downtrace (aborting on failure), interleaving non-affixing rules where possible, and permuting the
		/// skipping of downtrace elements. Thus, at each node of the outbound pass, zero or more stacks fan out
		/// upwards. A worked trace of this two-pass scheme is sketched after the constructor below.
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public LexicalAnalysis(Lexicon lex, TokenSet ts, Tray tray)
		{
			this.lex = lex;
			this.tray = tray;
			this.gp = lex.Grammar.Parser;
			this.ts = ts;

			/// Find zero or more analysis stacks for each source token and add them to a bag. Note that we operate mostly
			/// on the surface form alone, so the token 'tok' that is passed in is not referenced unless we need to check
			/// adjacency when postulating a multi-word stem. Otherwise the analyzer doesn't care how the tokens are laid
			/// out, overlapping, or gapped in the source, and they are returned as a bag that is, in principle, unordered.
			foreach (TokenSet.Token tok in ts)
			{
				DownTrace dt = new DownTraceSurface(this, tok.Text);
				token_analyses.AddRange(dt.TryNonAffixingRulesOutbound(tok));
				c_unif += dt.Sum(_dt => _dt.c_unif);
			}
#if DEBUG
			TextWriter tw = lex.Grammar.Parser.DebugOutput;
			if (tw != null)
			{
				int i = 0;
				foreach (AnalysisStack stk in token_analyses)
				{
					tw.WriteLineColorSync("$red analysis #{0}", i++);
					int j = stk.Count;
					foreach (LexicalTransform lx in stk.AsEnumerable().Reverse())
					{
						tw.WriteLineColorSync("\t{0,2} $darkyellow {1,-11}$ {2,19}\t{3,-20}",
							--j,
							"\"" + lx.CurrentForm.StringJoin(" ") + "\"",
							lx.GetType().Name,
							lx.license.Name);
					}
					tw.WriteLine();
				}
			}
#endif
		}
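		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		// Worked trace of the two-pass scheme (hypothetical grammar; illustration only, nothing here is executed).
		// Suppose the surface token is "dogs", a regular subrule rewrites final "s" to "" under a rule PLUR-NOUN, and
		// the lexicon has a stem entry for "dog":
		//
		//   inbound ("downtrace"):  "dogs" --PLUR-NOUN subrule--> "dog" --lexical lookup--> LexicalEntry for "dog"
		//
		//   outbound:               stack[0] = LxSingleTokenStem   ("dog", entry)
		//                           stack[1] = LxRegularAffixing   ("dogs", PLUR-NOUN, unified TFS)
		//
		// On the way back out, each node may additionally fan out copies of the stack with non-affixing lexical rules
		// interleaved, so a single surface token can contribute several AnalysisStacks to 'token_analyses'.
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////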
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// DownTrace : inverted tree of morphological transforms from the surface (top) towards the stem (leaves)
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		[DebuggerDisplay("{form}")]
		abstract class DownTrace : IEnumerable<DownTrace>
		{
			internal readonly LexicalAnalysis la;
			protected readonly DownTrace m_prev;
			public int c_unif = 0;
			readonly String form;

			public DownTrace(LexicalAnalysis la, DownTrace prev, String form)
			{
				this.la = la;
				this.m_prev = prev;
				this.form = form;
			}

			Lexicon Lexicon { get { return la.lex; } }
			Tray Tray { get { return la.tray; } }

			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// Stacks which permute all licensed orthographic changes are generated on the way down from the surface
			/// towards the stem, and recorded in an (inverted) tree of DownTrace objects. Here, when unwinding, non-affixing
			/// changes are inserted into these trees, possibly multiplying branches upwards at any point.
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			public IEnumerable<AnalysisStack> TryNonAffixingRulesOutbound(TokenSet.Token tok)
			{
				return OutboundMultiplier(TryAffixingRulesInbound(tok));
			}

			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// An arbitrary number of non-affixing rules can interleave. As long as we find any, multiply the returned
			/// stacks recursively upwards. (A standalone illustration of this fan-out shape follows below.)
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			IEnumerable<AnalysisStack> OutboundMultiplier(IEnumerable<AnalysisStack> seq)
			{
				foreach (AnalysisStack analysis_stack in seq)
				{
					foreach (var new_stk in OutboundMultiplier(FindNonAffixingRules(analysis_stack)))
						yield return new_stk;

					/// also return the stack without any non-affixing lexical rules applied
					/// todo: is this only valid if it matches the surface form?
					yield return analysis_stack;
				}
			}
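#if DEBUG
			// Illustration only (hypothetical helper; never called by the analyzer): the same recursive fan-out shape
			// as OutboundMultiplier above, demonstrated on plain strings. Each item yields all of its recursive
			// expansions before itself; termination relies on 'expand' eventually returning an empty sequence, just
			// as FindNonAffixingRules does once no further rule unifies. For example, expanding by appending "x"
			// while the length is under three turns { "a" } into the sequence "axx", "ax", "a".
			internal static IEnumerable<String> FanOutExample(IEnumerable<String> seq, Func<String, IEnumerable<String>> expand)
			{
				foreach (String s in seq)
				{
					foreach (String t in FanOutExample(expand(s), expand))
						yield return t;
					yield return s;		// also yield the item with no expansions applied
				}
			}
#endif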
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// Return new stacks formed by applying any valid non-affixing lexical rule to the top of the specified stack.
			/// If no further transformation can be generated on this stack, return an empty sequence.
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			IEnumerable<AnalysisStack> FindNonAffixingRules(AnalysisStack analysis_stack)
			{
				LexicalTransform lx = analysis_stack.Top;
				TfsEdge candidate = lx.feature_structure;

				IEnumerable<LexicalRule> rules_to_use;
				LexicalRule lxr = lx.license as LexicalRule;
				if (lxr != null)
					rules_to_use = lxr.CompatibleKeyMothers.OfExactType<LexicalRule>();
				else
					rules_to_use = Lexicon.non_morph_lexrules;

				foreach (LexicalRule lex_rule in rules_to_use)
				{
					/// try the unification
					Unification.Partial uh = new Unification.Partial(Tray);
					c_unif++;
					TfsEdge full, daughter = lex_rule.RuleDaughters[0];
					if (uh.UnifyAndComplete(lex_rule.Contents, daughter, candidate, out full))
					{
						la.edges.Add(full);

						/// we are outbound and need to fan out upwards, so make a copy of the stack below, add the
						/// new result, and return the "extra" result
						AnalysisStack newstk = new AnalysisStack(analysis_stack);
						newstk.Add(new LxNonAffixing(newstk.Top.CurrentForm, lex_rule, full));
						yield return newstk;
					}
				}
			}

			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// Return zero or more lexical analysis stacks for the specified surface form. Each lexical analysis stack that
			/// is gathered has exactly one stem, and is built from the stem (at index 0) "upwards." The source token that is
			/// supplied is advisory only, in the sense that its text does not reflect the current state of morphological
			/// processing. It is only used for matching the stem against multi-word lexical entries.
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			IEnumerable<AnalysisStack> TryAffixingRulesInbound(TokenSet.Token tok)
			{
				/// 0. The mere availability of an irregular form will block all regular affixing rules if
				/// irregular-forms-only-p is enabled.
				List<Lexicon.Irreg> rg_irreg_info = null;
				if (Lexicon.irreg_dict != null && !this.HasIrregular)
				{
					if (Lexicon.irreg_dict.TryGetValue(form, out rg_irreg_info))
						Debug.Assert(rg_irreg_info.Count > 0);
				}

				/// 1. Irregular affix (or non-affixing transform)
				/// At this point we are only allowing a single irregular inflection to be added to the downtrace, but
				/// this is now easy to reconfigure.
				if (rg_irreg_info != null)
				{
					foreach (Lexicon.Irreg irreg_info in rg_irreg_info)
					{
						LexicalRule lex_rule = irreg_info.rule;
						DownTrace dt = new DownTraceIrregular(this, irreg_info.stem);

						/// propagate downwards towards the stem first, and only unify with the stacks that come back, if any
						foreach (AnalysisStack analysis_stack in dt.TryNonAffixingRulesOutbound(tok))
						{
							/// rule check pre-filter
							LexicalRule lr = analysis_stack.Top.license as LexicalRule;
							if (lr == null || lr.CompatibleKeyMothers.Contains(lex_rule))
							{
								/// try the unification
								Unification.Partial uh = new Unification.Partial(Tray);
								c_unif++;
								TfsEdge full;
								if (uh.UnifyAndComplete(lex_rule.Contents, lex_rule.RuleDaughters[0], analysis_stack.Top.feature_structure, out full))
								{
									la.edges.Add(full);
									analysis_stack.Add(new LxIrregular(form, lex_rule, full));
									yield return analysis_stack;
								}
							}
						}
						c_unif += dt.c_unif;
					}
				}
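				/// Example (hypothetical irregulars entry; illustration only): a table row mapping surface "ate" to
				/// stem "eat" under a rule PAST-V-IRULE sends the downtrace down via "eat", where stem lookup (and
				/// possibly further rules) proceed; the outbound pass then re-applies PAST-V-IRULE by unification on
				/// top of each returned stack. When irregular-forms-only-p is enabled, the presence of that row also
				/// suppresses step 2 below for "ate".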
				/// 2. Regular, affixing lexical rules
				if (rg_irreg_info == null || !la.Parser.Config.IrregularFormsOnly)
				{
					foreach (MorphologicalRule morph_rule in Lexicon.morph_lexrules)
					{
						foreach (MorphologySubrule subrule in morph_rule.Subrules)
						{
							String newform = subrule.regex.Replace(form, subrule.replace);
							if (newform != form)
							{
								DownTrace dt = new DownTraceRegular(this, newform);

								/// propagate downwards towards the stem first, and only unify with the stacks that come back, if any
								foreach (AnalysisStack analysis_stack in dt.TryNonAffixingRulesOutbound(tok))
								{
									/// rule check pre-filter
									LexicalRule lr = analysis_stack.Top.license as LexicalRule;
									if (lr == null || lr.CompatibleKeyMothers.Contains(morph_rule))
									{
										/// try the unification
										Unification.Partial uh = new Unification.Partial(Tray);
										c_unif++;
										TfsEdge full;
										if (uh.UnifyAndComplete(morph_rule.Contents, morph_rule.RuleDaughters[0], analysis_stack.Top.feature_structure, out full))
										{
											la.edges.Add(full);
											analysis_stack.Add(new LxRegularAffixing(form, morph_rule, full));
											yield return analysis_stack;
										}
									}
								}
								c_unif += dt.c_unif;
							}
						}
					}
				}

				/// 3. Stem
				bool f_did_expand;
				TokenSet.Token[][] rg_seq = null;
				foreach (LexicalEntry le in Lexicon[form])
				{
					int lex_entry_arity = le.Lemmata.Count;
					if (lex_entry_arity > 1)
					{
						rg_seq = rg_seq ?? tok.RightAdjacencySequences().Select(ie => ie.ToArray()).ToArray();
						if (rg_seq.Length > 1)
						{
							Nop.CodeCoverage();
							// probably need to filter the sequences for distinctness here (array-value equality) since there may be dups after truncating
							//var yz = rg_seq.Where(rgt => rgt.Length >= lex_entry_arity - 1).Select(ie => ie.Take(lex_entry_arity - 1).ToArray()).ToArray();
							//if (yz.Distinct(rgt => rgt.StringJoin("")).Count() != yz.Length)
						}

						/// ensure that all parts of a multi-word lexical entry match an available adjacency
						/// sequence in the surface input
						foreach (var adj_seq in rg_seq)
						{
							/// the first token, which is not included in adj_seq, has already been checked
							if (1 + adj_seq.Length >= lex_entry_arity)
							{
								for (int i = 1; i < lex_entry_arity; i++)
									if (!Lexicon.TextComparer.Equals(adj_seq[i - 1].Text, le.Lemmata[i]))
										goto no_match;
								Span sp = new Span(tok.TokenSpan.StartIndex, adj_seq[lex_entry_arity - 2].TokenSpan.EndIndex);
								yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
								if (f_did_expand)
									c_unif++;
							}
						no_match:
							;
						}
					}
					else
					{
						yield return new AnalysisStack(la, new LxSingleTokenStem(tok, le, out f_did_expand));
						if (f_did_expand)
							c_unif++;
					}
				}
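				/// Example (illustration only): for a two-lemma entry "ad hoc" (le.Lemmata = { "ad", "hoc" }) keyed on
				/// the token "ad", step 3 requires some right-adjacency sequence to continue with a token whose text
				/// matches "hoc"; the resulting LxMultiTokenStem then spans from the start of "ad" through the end of
				/// "hoc". A single-lemma entry skips the adjacency check and yields an LxSingleTokenStem for 'tok' alone.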
				/// 4. Check non-initial lemmata in multi-word lexemes (only if there is a spelling change; otherwise this
				/// is handled by step #3). Using HasSpellingChange means non-affixing irregulars are blocked here, so
				/// "world series" will not generate a stack for plural 'series' in this step. If the requirement were
				/// changed to HasTransform instead, the plural "world series" would be generated, but at the expense of
				/// also creating another singular analysis, duplicating the (presumably singular) one from step #3. The
				/// duplicates scenario seems like the worse problem.
				if (HasSpellingChange)
				{
					foreach (Lexicon.NonInitialMwe mwe in Lexicon.mwe_lookup[form])
					{
						LexicalEntry le = mwe.lex_entry;

						/// check to the left. There will always be at least one token.
						foreach (var adj_left in tok.LeftAdjacencySequences().Select(ie => ie.TakeLast(mwe.index).ToArray()).Where(rgt => rgt.Length == mwe.index))
						{
							for (int j = 0; j < mwe.index; j++)
								if (!Lexicon.TextComparer.Equals(adj_left[j].Text, le.Lemmata[j]))
									goto no_match;

							/// check to the right. There may be zero or more.
							int rem = le.Lemmata.Count - (mwe.index + 1);
							if (rem == 0)
							{
								Span sp = new Span(adj_left[0].TokenSpan.StartIndex, tok.TokenSpan.EndIndex);
								yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
								if (f_did_expand)
									c_unif++;
							}
							else
							{
								foreach (var adj_right in tok.RightAdjacencySequences().Select(ie => ie.Take(rem).ToArray()).Where(rgt => rgt.Length == rem))
								{
									for (int j = 0; j < rem; j++)
										if (!Lexicon.TextComparer.Equals(adj_right[j].Text, le.Lemmata[mwe.index + 1 + j]))
											goto no_match;
									Nop.CodeCoverage();
									Span sp = new Span(adj_left[0].TokenSpan.StartIndex, adj_right[rem - 1].TokenSpan.EndIndex);
									yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
									if (f_did_expand)
										c_unif++;
								}
							}
						no_match:
							;
						}
					}
				}
			}

			/// <summary>
			/// Are there any irregular inflections in this downtrace?
			/// </summary>
			public bool HasIrregular { get { return this.Any(dt => dt is DownTraceIrregular); } }

			/// <summary>
			/// Are there any morphological transforms (regular or irregular) in this downtrace?
			/// </summary>
			public bool HasTransform { get { return this.Any(dt => dt is DownTraceTransform); } }

			/// <summary>
			/// Are there any spelling changes in this downtrace?
			/// </summary>
			public bool HasSpellingChange { get { return this.Any(dt => dt.form != form); } }

			/// <summary>
			/// Each DownTrace is polymorphic with the linked list that it terminates: enumerating a node yields the path
			/// so far--in a tree of all such paths--from the surface token down to this node in the current analysis descent.
			/// </summary>
			public IEnumerator<DownTrace> GetEnumerator()
			{
				if (m_prev != null)
					foreach (DownTrace prev in m_prev)
						yield return prev;
				yield return this;
			}

			IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }

#if DEBUG
			[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
			protected DownTrace[] _dbg_display { get { return this.ToArray(); } }
#endif
		};

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// DownTrace implementations:
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		class DownTraceSurface : DownTrace
		{
			public DownTraceSurface(LexicalAnalysis la, String form)
				: base(la, null, form)
			{
			}
		};

		abstract class DownTraceTransform : DownTrace
		{
			public DownTraceTransform(DownTrace prev, String form)
				: base(prev.la, prev, form)
			{
			}
		};

		class DownTraceIrregular : DownTraceTransform
		{
			public DownTraceIrregular(DownTrace prev, String form)
				: base(prev, form)
			{
			}
		};

		class DownTraceRegular : DownTraceTransform
		{
			public DownTraceRegular(DownTrace prev, String form)
				: base(prev, form)
			{
			}
		};

		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// mundane:
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		public void Dispose()
		{
			foreach (TfsEdge te in edges)
				te.Dispose();
			edges.Clear();
		}

		public IEnumerator<ParseChart.IParseChartToken> GetEnumerator()
		{
			foreach (var ipce in token_analyses)
				yield return ipce;
		}

		IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }

		public override string ToString()
		{
			return String.Format("tokens: {0} chart size: {1} [{2}]", token_analyses.Count, ChartSize, ts.Source.Text);
		}

#if DEBUG
		[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
		public AnalysisStack[] _RGTOKA { get { return token_analyses.ToArray(); } }
#endif
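#if DEBUG
		// Usage sketch (hypothetical driver code; the chart API shown here is assumed, not defined in this file):
		//
		//   using (var la = new LexicalAnalysis(lex, ts, tray))
		//   {
		//       var chart = new ParseChart(la.ChartSize);          // size from ChartSize...
		//       foreach (ParseChart.IParseChartToken tok in la)    // ...not from the enumeration count
		//           chart.Add(tok);
		//   }                                                      // Dispose() releases the registered non-canonical TfsEdges
		//
		/// <summary>
		/// Illustration only (hypothetical helper; not part of the parsing pipeline). Demonstrates the distinction
		/// called out in the class summary: with overlapping tokens, the number of enumerated chart tokens may exceed
		/// ChartSize, so charts must be sized from ChartSize.
		/// </summary>
		internal static String _dbg_describe(LexicalAnalysis la)
		{
			return String.Format("{0} chart token(s) over {1} chart position(s)", la.Count(), la.ChartSize);
		}
#endif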
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		/// <summary>
		/// AnalysisStacks are built from the bottom up only, so you must supply the stem here in order to create
		/// a stack. There must be exactly one stem.
		/// </summary>
		///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
		[DebuggerDisplay("{ToString(),nq}")]
		public class AnalysisStack : List<LexicalTransform>, ParseChart.IParseChartToken
		{
			[DebuggerBrowsable(DebuggerBrowsableState.Never)]
			readonly LexicalAnalysis analysis;
			readonly Tray tray;
			readonly Lexicon lex;

			public AnalysisStack(LexicalAnalysis analysis, LxStem mx)
			{
				this.analysis = analysis;
				this.tray = analysis.tray;
				this.lex = analysis.lex;
				base.Add(mx);
			}

			public AnalysisStack(AnalysisStack to_copy)
			{
				/// copy the environment along with the transform list
				this.analysis = to_copy.analysis;
				this.tray = to_copy.tray;
				this.lex = to_copy.lex;
				this.AddRange(to_copy);
			}

			[DebuggerBrowsable(DebuggerBrowsableState.Never)]
			public LxStem Stem
			{
				get
				{
					Debug.Assert(this[0] is LxStem);
					return (LxStem)this[0];
				}
			}

			[DebuggerBrowsable(DebuggerBrowsableState.Never)]
			public LexicalTransform Top { get { return this[this.Count - 1]; } }

			[DebuggerBrowsable(DebuggerBrowsableState.Never)]
			public TfsEdge Contents { get { return Top.feature_structure; } }

			[DebuggerBrowsable(DebuggerBrowsableState.Never)]
			public Span ChartSpan { get { return Stem.ChartSpan; } }

			public string Text { get { return analysis.SourceItem.Source.MinimalSpanText(ChartSpan); } }

			public bool SpinCompare(TfsEdge other) { return this.Contents.Type == other.Type; }

			public void Dispose() { }

			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// IAtomicallySequencable (ParseChart.IParseChartToken) implementation.
			/// To be allowed into the parse chart, you must be able to participate in sequence-list atomic stamping.
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			int i_seq = unchecked((int)uint.MaxValue);	// should be ast.InertValue

			public uint SequenceId { get { return (uint)i_seq; } }
			public void StampSequence(AtomicSequencer ast) { ast.StampLocation(ref i_seq); }
			public bool SetInert(AtomicSequencer ast) { return ast.SetInert(ref i_seq); }
			public bool IsInert(AtomicSequencer ast) { return (uint)i_seq == ast.InertValue; }

			public ISysObj License { get { return Top.license; } }

			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			/// <summary>
			/// Diagnostic display: the chart span, the chain of transforms from stem to top, and the type of the
			/// topmost feature structure.
			/// </summary>
			///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
			public override string ToString()
			{
				//return String.Format("{0} transforms: {1} stem: {2} fs: {3}", this.ChartSpan, this.Count, this.Stem, this.Self);
				return String.Format("{0} {1} {2}", this.ChartSpan, this.StringJoin("->"), this.Contents.Type.Name);
			}

#if DEBUG
			[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
			public LexicalTransform[] _TRANSFORMS { get { return this.AsEnumerable().Reverse().ToArray(); } }
#endif
		};
	};
}