using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.IO;
using glue.Tokenization;
using glue.Debugging;
using glue.Extensions.Enumerable;
using glue.Collections.ReadOnly;
using glue.Collections.XSpinLock;
namespace agree
{
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// The 'count' of ParseChart.IParseChartToken items encapsulated in the LexicalAnalysis enumerator is *not*
/// necessarily the required size of the parse chart, due to overlapping tokens or multiple tokens at a given
/// chart position; use the ChartSize property for that.
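///
/// A typical usage sketch (hypothetical; 'lexicon', 'token_set', 'tray', and 'chart' are illustrative names):
/// construct the analysis in a using block so that non-canonical TFSs created during morpholexical processing
/// are released on disposal, then enumerate the resulting chart tokens:
/// <code>
/// using (var la = new LexicalAnalysis(lexicon, token_set, tray))
/// {
///     foreach (ParseChart.IParseChartToken tok in la)
///         chart.Add(tok);    // stand-in for whatever consumes the tokens
/// }
/// </code>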
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{ToString(),nq}")]
public partial class LexicalAnalysis : IEnumerable<ParseChart.IParseChartToken>, IDisposable
{
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
readonly TokenSet ts;
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
readonly Lexicon lex;
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
readonly Tray tray;
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
readonly GrammarParser gp;
readonly List<AnalysisStack> token_analyses = new List<AnalysisStack>();
/// <summary>
/// Because TfsEdges are referenced by multiple stacks within this LexicalAnalysis, we provide a method for their
/// lifetime to be managed here. If a transform creates non-canonical TFSs, they are registered here to be
/// cleaned up later. Canonical TFSs, such as those of lexical rules, should not be added.
/// </summary>
readonly List<TfsEdge> edges = new List<TfsEdge>();
public Lexicon Lexicon { get { return lex; } }
public TokenSet SourceItem { get { return ts; } }
public GrammarParser Parser { get { return gp; } }
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public String SourceText { get { return ts.Source.Text; } }
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public int ChartSize { get { return ts.MinimalSpanCount; } }
public int c_unif = 0;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Performs pre-parsing analysis of the tokens in a tokenization hypothesis, including lexical and irregular
/// stem lookup, application of morphological and lexical rules, filtering, etc. The hypothesis may be either
/// single-coverage (i.e. 'TokenizationHypothesis') or ad-hoc, where token selection is deferred to the
/// parsing phase (i.e. 'TokenizedString').
///
/// Morpholexical analysis happens in two passes. The surface-to-stem inbound pass ("downtrace") permutes affixation--
/// regular or irregular--fanning downwards until one or more stems are reached. Then, for each discovered downtrace,
/// an outbound pass follows the downtrace in reverse, checking the license requirements of each element (aborting
/// on failure), interleaving non-affixing rules where possible, and permuting the skipping of downtrace elements.
/// Thus, at each node of the outbound pass, zero or more stacks fan out upwards.
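///
/// As an illustration (forms and rule names are hypothetical): for the surface "dogs", the inbound pass might
/// record the downtrace "dogs" -> "dog" via a plural affixing subrule; the outbound pass then rebuilds analysis
/// stacks from the stem upwards--e.g. [stem "dog", plural rule]--optionally interleaving non-affixing lexical
/// rules at each step.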
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public LexicalAnalysis(Lexicon lex, TokenSet ts, Tray tray)
{
this.lex = lex;
this.tray = tray;
this.gp = lex.Grammar.Parser;
this.ts = ts;
/// Find zero or more analysis stacks for each source token and add them to a bag. Note that we operate mostly
/// on the surface form alone, so the token 'tok' that is passed in is not referenced except when checking
/// adjacency while postulating a multi-word stem. Otherwise the analyzer does not care how the tokens are laid
/// out--overlapping or gapped--in the source, and they are returned as a bag that is, in principle, unordered.
foreach (TokenSet.Token tok in ts)
{
DownTrace dt = new DownTraceSurface(this, tok.Text);
token_analyses.AddRange(dt.TryNonAffixingRulesOutbound(tok));
c_unif += dt.Sum(_dt => _dt.c_unif);
}
#if DEBUG
TextWriter tw = lex.Grammar.Parser.DebugOutput;
if (tw != null)
{
int i = 0;
foreach (AnalysisStack stk in token_analyses)
{
tw.WriteLineColorSync("$red analysis #{0}", i++);
int j = stk.Count;
foreach (LexicalTransform lx in stk.AsEnumerable().Reverse())
{
tw.WriteLineColorSync("\t{0,2} $darkyellow {1,-11}$ {2,19}\t{3,-20}",
--j,
"\"" + lx.CurrentForm.StringJoin(" ") + "\"",
lx.GetType().Name,
lx.license.Name);
}
tw.WriteLine();
}
}
#endif
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// DownTrace : inverted tree of morphological transforms from the surface (top) towards the stem (leaves)
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{form}")]
abstract class DownTrace : IEnumerable<DownTrace>
{
internal readonly LexicalAnalysis la;
protected readonly DownTrace m_prev;
public int c_unif = 0;
String form;
public DownTrace(LexicalAnalysis la, DownTrace prev, String form)
{
this.la = la;
this.m_prev = prev;
this.form = form;
}
Lexicon Lexicon { get { return la.lex; } }
Tray Tray { get { return la.tray; } }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Permutations of all licensed orthographic changes are generated on the way down from the surface towards
/// the stem and recorded in an (inverted) tree of DownTrace objects. Here, when unwinding, non-affixing changes
/// are inserted into these trees, possibly multiplying branches upwards at any point.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public IEnumerable<AnalysisStack> TryNonAffixingRulesOutbound(TokenSet.Token tok)
{
return OutboundMultiplier(TryAffixingRulesInbound(tok));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// An arbitrary number of non-affixing rules can interleave. As long as we find any, multiply the returned
/// stacks recursively upwards.
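///
/// For example, if non-affixing rules A and B can each apply on top of an incoming stack S, the yielded stacks
/// include S itself, S+A, S+B, and whatever longer chains (e.g. S+A+B) the recursion licenses.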
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
IEnumerable<AnalysisStack> OutboundMultiplier(IEnumerable<AnalysisStack> seq)
{
foreach (AnalysisStack analysis_stack in seq)
{
foreach (var new_stk in OutboundMultiplier(FindNonAffixingRules(analysis_stack)))
yield return new_stk;
/// also return the stack without any non-affixing lexical rules applied
/// todo: is this only valid if it matches the surface form?
yield return analysis_stack;
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Return new stacks formed by applying each valid non-affixing lexical rule to the top of the specified stack.
/// If no further transformation can be generated on this stack, return an empty sequence.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
IEnumerable<AnalysisStack> FindNonAffixingRules(AnalysisStack analysis_stack)
{
LexicalTransform lx = analysis_stack.Top;
TfsEdge candidate = lx.feature_structure;
IEnumerable<LexicalRule> rules_to_use;
LexicalRule lxr = lx.license as LexicalRule;
if (lxr != null)
rules_to_use = lxr.CompatibleKeyMothers.OfExactType<LexicalRule>();
else
rules_to_use = Lexicon.non_morph_lexrules;
foreach (LexicalRule lex_rule in rules_to_use)
{
/// try the unification
Unification.Partial uh = new Unification.Partial(Tray);
c_unif++;
TfsEdge full, daughter = lex_rule.RuleDaughters[0];
if (uh.UnifyAndComplete(lex_rule.Contents, daughter, candidate, out full))
{
la.edges.Add(full);
/// we are outbound and need to fan out upwards so make a copy of the stack below, add the
/// new result, and return the "extra" result
AnalysisStack newstk = new AnalysisStack(analysis_stack);
newstk.Add(new LxNonAffixing(newstk.Top.CurrentForm, lex_rule, full));
yield return newstk;
}
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Return zero or more lexical analysis stacks for the specified surface form. Each lexical analysis stack that is
/// gathered has exactly one stem, and is built from the stem (at index 0) "upwards." The source token that is
/// supplied is advisory only in the sense that its text does not reflect the current state of morphological
/// processing. It is only used for matching the stem against multi-word lexical entries.
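///
/// A returned stack might look like (illustratively) [LxSingleTokenStem, LxRegularAffixing], read from the
/// stem at index 0 upwards to the most recently applied transform at Top.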
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
IEnumerable<AnalysisStack> TryAffixingRulesInbound(TokenSet.Token tok)
{
/// 0. mere availability of an irregular form will block all regular affixing rules if
/// irregular-forms-only-p is enabled.
List<Lexicon.Irreg> rg_irreg_info = null;
if (Lexicon.irreg_dict != null && !this.HasIrregular)
{
if (Lexicon.irreg_dict.TryGetValue(form, out rg_irreg_info))
Debug.Assert(rg_irreg_info.Count > 0);
}
/// 1. Irregular affix (or non-affixing transform)
/// At this point we are only allowing a single irregular inflection to be added to the downtrace, but
/// this is now easy to reconfigure.
if (rg_irreg_info != null)
{
foreach (Lexicon.Irreg irreg_info in rg_irreg_info)
{
LexicalRule lex_rule = irreg_info.rule;
DownTrace dt = new DownTraceIrregular(this, irreg_info.stem);
/// propagate downwards towards stem first, and only unify with the stacks that come back, if any
foreach (AnalysisStack analysis_stack in dt.TryNonAffixingRulesOutbound(tok))
{
/// rule check pre-filter
LexicalRule lr = analysis_stack.Top.license as LexicalRule;
if (lr == null || lr.CompatibleKeyMothers.Contains(lex_rule))
{
/// try the unification
Unification.Partial uh = new Unification.Partial(Tray);
c_unif++;
TfsEdge full;
if (uh.UnifyAndComplete(lex_rule.Contents,
lex_rule.RuleDaughters[0],
analysis_stack.Top.feature_structure,
out full))
{
la.edges.Add(full);
analysis_stack.Add(new LxIrregular(form, lex_rule, full));
yield return analysis_stack;
}
}
}
c_unif += dt.c_unif;
}
}
/// 2. Regular, affixing lexical rules
if (rg_irreg_info == null || !la.Parser.Config.IrregularFormsOnly)
{
foreach (MorphologicalRule morph_rule in Lexicon.morph_lexrules)
{
foreach (MorphologySubrule subrule in morph_rule.Subrules)
{
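/// apply the orthographic subrule in the surface-to-stem direction; illustratively, a subrule whose pattern
/// strips a final "s" maps "dogs" to "dog" (the actual patterns come from the grammar's morphology)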
String newform = subrule.regex.Replace(form, subrule.replace);
if (newform != form)
{
DownTrace dt = new DownTraceRegular(this, newform);
/// propagate downwards towards stem first, and only unify with the stacks that come back, if any
foreach (AnalysisStack analysis_stack in dt.TryNonAffixingRulesOutbound(tok))
{
/// rule check pre-filter
LexicalRule lr = analysis_stack.Top.license as LexicalRule;
if (lr == null || lr.CompatibleKeyMothers.Contains(morph_rule))
{
/// try the unification
Unification.Partial uh = new Unification.Partial(Tray);
c_unif++;
TfsEdge full;
if (uh.UnifyAndComplete(morph_rule.Contents,
morph_rule.RuleDaughters[0],
analysis_stack.Top.feature_structure,
out full))
{
la.edges.Add(full);
analysis_stack.Add(new LxRegularAffixing(form, morph_rule, full));
yield return analysis_stack;
}
}
}
c_unif += dt.c_unif;
}
}
}
}
/// 3. Stem
bool f_did_expand;
TokenSet.Token[][] rg_seq = null;
foreach (LexicalEntry le in Lexicon[form])
{
int lex_entry_arity = le.Lemmata.Count;
if (lex_entry_arity > 1)
{
rg_seq = rg_seq ?? tok.RightAdjacencySequences().Select(ie => ie.ToArray()).ToArray();
if (rg_seq.Length > 1)
{
Nop.CodeCoverage();
// probably need to filter the sequences for distinctness here (array-value equality) since there may be dups after truncating
//var yz = rg_seq.Where(rgt => rgt.Length >= lex_entry_arity - 1).Select(ie => ie.Take(lex_entry_arity - 1).ToArray()).ToArray();
//if (yz.Distinct(rgt => rgt.StringJoin("")).Count() != yz.Length)
}
/// ensure that all parts of a multi-word lexical entry match an available adjacency
/// sequence in the surface input
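/// e.g. (illustratively) for a two-lemma entry such as "ad hoc", the current token has already matched "ad",
/// so some right-adjacent token must match "hoc"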
foreach (var adj_seq in rg_seq)
{
/// first token, which is not included in adj_seq, has already been checked
if (1 + adj_seq.Length >= lex_entry_arity)
{
for (int i = 1; i < lex_entry_arity; i++)
if (!Lexicon.TextComparer.Equals(adj_seq[i - 1].Text, le.Lemmata[i]))
goto no_match;
Span sp = new Span(tok.TokenSpan.StartIndex, adj_seq[lex_entry_arity - 2].TokenSpan.EndIndex);
yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
if (f_did_expand)
c_unif++;
}
no_match:
;
}
}
else
{
yield return new AnalysisStack(la, new LxSingleTokenStem(tok, le, out f_did_expand));
if (f_did_expand)
c_unif++;
}
}
/// 4. Check non-initial lemma in a multi-word lexeme (only if there is a spelling change; otherwise this is
/// handled by step #3). Using HasSpellingChange means non-affixing irregulars are blocked here, so "world
/// series" will not generate a stack for plural 'series' in this step. If the requirement were changed to
/// HasTransform instead, the plural "world series" would be generated, but at the expense of also creating
/// another singular one, duplicating the (presumably) singular one from step #3. The duplicates scenario
/// seems like the worse problem.
if (HasSpellingChange)
{
foreach (Lexicon.NonInitialMwe mwe in Lexicon.mwe_lookup[form])
{
LexicalEntry le = mwe.lex_entry;
/// check to the left. There will always be at least one.
foreach (var adj_left in tok.LeftAdjacencySequences().Select(ie => ie.TakeLast(mwe.index).ToArray()).Where(rgt => rgt.Length == mwe.index))
{
for (int j = 0; j < mwe.index; j++)
if (!Lexicon.TextComparer.Equals(adj_left[j].Text, le.Lemmata[j]))
goto no_match;
/// check to the right. There may be zero or more
int rem = le.Lemmata.Count - (mwe.index + 1);
if (rem == 0)
{
Span sp = new Span(adj_left[0].TokenSpan.StartIndex, tok.TokenSpan.EndIndex);
yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
if (f_did_expand)
c_unif++;
}
else
{
foreach (var adj_right in tok.RightAdjacencySequences().Select(ie => ie.Take(rem).ToArray()).Where(rgt => rgt.Length == rem))
{
for (int j = 0; j < rem; j++)
if (!Lexicon.TextComparer.Equals(adj_right[j].Text, le.Lemmata[mwe.index + 1 + j]))
goto no_match;
Nop.CodeCoverage();
Span sp = new Span(adj_left[0].TokenSpan.StartIndex, adj_right[rem - 1].TokenSpan.EndIndex);
yield return new AnalysisStack(la, new LxMultiTokenStem(sp, le, out f_did_expand));
if (f_did_expand)
c_unif++;
}
}
no_match:
;
}
}
}
}
/// <summary>
/// Are there any irregular inflections in this downtrace?
/// </summary>
public bool HasIrregular { get { return this.Any(dt => dt is DownTraceIrregular); } }
/// <summary>
/// Is there any affixation in this downtrace?
/// </summary>
public bool HasTransform { get { return this.Any(dt => dt is DownTraceTransform); } }
/// <summary>
/// Are there any spelling changes in this downtrace?
/// </summary>
public bool HasSpellingChange { get { return this.Any(dt => dt.form != form); } }
/// <summary>
/// Each DownTrace is polymorphic with the linked list that it terminates. This linked list represents the
/// path so far--in a tree of all such paths--from the surface token in the current analysis descent.
/// </summary>
public IEnumerator<DownTrace> GetEnumerator()
{
if (m_prev != null)
foreach (DownTrace prev in m_prev)
yield return prev;
yield return this;
}
IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }
#if DEBUG
[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
protected DownTrace[] _dbg_display { get { return this.ToArray(); } }
#endif
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// DownTrace implementations:
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
class DownTraceSurface : DownTrace
{
public DownTraceSurface(LexicalAnalysis la, String form) : base(la, null, form) { }
};
abstract class DownTraceTransform : DownTrace
{
public DownTraceTransform(DownTrace prev, String form) : base(prev.la, prev, form) { }
};
class DownTraceIrregular : DownTraceTransform
{
public DownTraceIrregular(DownTrace prev, String form) : base(prev, form) { }
};
class DownTraceRegular : DownTraceTransform
{
public DownTraceRegular(DownTrace prev, String form) : base(prev, form) { }
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// mundane:
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public void Dispose()
{
foreach (TfsEdge te in edges)
te.Dispose();
edges.Clear();
}
public IEnumerator<ParseChart.IParseChartToken> GetEnumerator()
{
foreach (var ipce in token_analyses)
yield return ipce;
}
IEnumerator IEnumerable.GetEnumerator() { return GetEnumerator(); }
public override string ToString()
{
return String.Format("tokens: {0} chart size: {1} [{2}]", token_analyses.Count, ChartSize, ts.Source.Text);
}
#if DEBUG
[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
public AnalysisStack[] _RGTOKA { get { return token_analyses.ToArray(); } }
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// AnalysisStacks are built from the bottom up only, so you must supply the stem here in order to create
/// a stack. There must be exactly one stem.
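///
/// e.g. (as used in TryAffixingRulesInbound): new AnalysisStack(la, new LxSingleTokenStem(tok, le, out f_did_expand))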
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{ToString(),nq}")]
public class AnalysisStack : List<LexicalTransform>, ParseChart.IParseChartToken
{
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
readonly LexicalAnalysis analysis;
readonly Tray tray;
readonly Lexicon lex;
public AnalysisStack(LexicalAnalysis analysis, LxStem mx)
{
this.analysis = analysis;
this.tray = analysis.tray;
this.lex = analysis.lex;
base.Add(mx);
}
public AnalysisStack(AnalysisStack to_copy)
{
this.analysis = to_copy.analysis;
this.tray = to_copy.tray;
this.lex = to_copy.lex;
this.AddRange(to_copy);
}
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public LxStem Stem
{
get
{
Debug.Assert(this[0] is LxStem);
return (LxStem)this[0];
}
}
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public LexicalTransform Top { get { return this[this.Count - 1]; } }
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public TfsEdge Contents { get { return Top.feature_structure; } }
[DebuggerBrowsable(DebuggerBrowsableState.Never)]
public Span ChartSpan { get { return Stem.ChartSpan; } }
public string Text { get { return analysis.SourceItem.Source.MinimalSpanText(ChartSpan); } }
public bool SpinCompare(TfsEdge other)
{
return this.Contents.Type == other.Type;
}
public void Dispose() { }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// IAtomicallySequencable(ParseChart.IParseChartToken) implementation
/// To be allowed into the parse chart, an item must be able to participate in sequence-list atomic stamping.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int i_seq = unchecked((int)uint.MaxValue); // should be ast.InertValue
public uint SequenceId
{
get { return (uint)i_seq; }
}
public void StampSequence(AtomicSequencer ast)
{
ast.StampLocation(ref i_seq);
}
public bool SetInert(AtomicSequencer ast)
{
return ast.SetInert(ref i_seq);
}
public bool IsInert(AtomicSequencer ast)
{
return (uint)i_seq == ast.InertValue;
}
public ISysObj License
{
get { return Top.license; }
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Diagnostic display: chart span, the chain of transforms, and the type of the resulting feature structure.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public override string ToString()
{
//return String.Format("{0} transforms: {1} stem: {2} fs: {3}", this.ChartSpan, this.Count, this.Stem, this.Self);
return String.Format("{0} {1} {2}", this.ChartSpan, this.StringJoin("->"), this.Contents.Type.Name);
}
#if DEBUG
[DebuggerBrowsable(DebuggerBrowsableState.RootHidden)]
public LexicalTransform[] _TRANSFORMS
{
get { return this.AsEnumerable().Reverse().ToArray(); }
}
#endif
};
};
}