using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.Serialization.Formatters.Binary;
using System.Threading;
using System.Threading.Tasks;

using miew.Enumerable;
using miew.Tokenization;

namespace agree
{
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /// <summary>
    /// Lexicon for a grammar. Lexical entries are indexed by their first lemma; multi-word entries are additionally
    /// indexed by each of their non-initial lemmata. An optional table of irregular forms maps an inflected form
    /// to the stem/rule pairs listed for it. Exposes the first-lemma index as an
    /// <see cref="ILookup{String, LexicalEntry}"/>.
    /// </summary>
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    [Serializable]
    public class Lexicon : ILookup<String, LexicalEntry>
    {
        /// <summary>
        /// One irregular-form record: the stem and the lexical rule listed for an inflected form.
        /// </summary>
        public struct Irreg
        {
            public String stem;
            public LexicalRule rule;
        }

        /// <summary>
        /// A non-initial word of a multi-word lexical entry: 'index' is the zero-based position of the word within
        /// the entry's Lemmata (always 1 or greater) and 'lex_entry' is the entry the word belongs to.
        /// </summary>
        [DebuggerDisplay("ix: {index} {lex_entry.ToString(),nq}")]
        public struct NonInitialMwe
        {
            public int index;
            public LexicalEntry lex_entry;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Builds the lexicon indexes from the supplied entries and, when provided, the irregular-forms table.
        /// </summary>
        /// <param name="g">Grammar supplying the entry dictionary (via g.tm) and the lexical rules.</param>
        /// <param name="lex_entries">The lexical entries to index; enumerated once by this constructor.</param>
        /// <param name="irregs">Irregular-form records, or null, in which case irreg_dict is left null.</param>
        /// <exception cref="ArgumentNullException">g or lex_entries is null.</exception>
        /// <exception cref="InvalidDataException">
        /// An irregs record names a rule that is not in the entry dictionary or is not a LexicalRule.
        /// </exception>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public Lexicon(Grammar g, IEnumerable<LexicalEntry> lex_entries, List<GrammarFileSet.IrregInfo> irregs)
        {
            // Fail with a descriptive exception instead of a NullReferenceException further down.
            if (g == null)
                throw new ArgumentNullException("g");
            if (lex_entries == null)
                throw new ArgumentNullException("lex_entries");

            this.g = g;
            this.lex_entries = lex_entries;

#if false
            text_comparer = g.tm.config.options.HasFlag(Globals.Options.CaseSensitive) ?
                                StringComparer.InvariantCulture :
                                StringComparer.InvariantCultureIgnoreCase;
#else
            text_comparer = StringComparer.InvariantCultureIgnoreCase;
#endif

            // ToLookup enumerates MultiWordFork to completion right here, so 'mw' holds every multi-word entry
            // by the time mwe_lookup is built from it below.
            List<LexicalEntry> mw = new List<LexicalEntry>();
            lex_lookup = MultiWordFork(lex_entries, mw).ToLookup(lex => lex.Lemmata[0], text_comparer);

            // One record per non-initial word; Skip(1) drops the first lemma, hence the 'ix + 1'.
            mwe_lookup = mw
                .SelectMany(e => e.Lemmata.Skip(1).Select((w, ix) => new { w, index = ix + 1, lex_entry = e }))
                .ToLookup(a => a.w, a => new NonInitialMwe { index = a.index, lex_entry = a.lex_entry }, text_comparer);

            if (irregs != null)
                irreg_dict = BuildIrregDictionary(g, irregs);

            // NOTE(review): OfExactType presumably excludes subclasses such as MorphologicalRule -- it is a
            // project extension method not visible here; confirm.
            non_morph_lexrules = g._lexical_rules.OfExactType<LexicalRule>().ToArray();
            morph_lexrules = g._lexical_rules.OfType<MorphologicalRule>().ToArray();
        }

        /// <summary>
        /// Groups the irregs records by inflected form (case-insensitively), resolving each rule name through the
        /// grammar's entry dictionary.
        /// </summary>
        static Dictionary<String, List<Irreg>> BuildIrregDictionary(Grammar g, List<GrammarFileSet.IrregInfo> irregs)
        {
            Dictionary<String, List<Irreg>> dict =
                new Dictionary<String, List<Irreg>>(StringComparer.InvariantCultureIgnoreCase);

            foreach (var iri in irregs)
            {
                Entry r;
                if (!g.tm.entry_dict.TryGetValue(iri.s_rule, out r))
                {
                    String msg = String.Format("Rule '{0}' listed as a rule in the irregs file is not a recognized rule.", iri.s_rule);
                    throw new InvalidDataException(msg);
                }

                LexicalRule rule = r as LexicalRule;
                if (rule == null)
                {
                    String msg = String.Format("Rule '{0}' listed as a rule in the irregs file is not an inflection rule.", iri.s_rule);
                    throw new InvalidDataException(msg);
                }

                List<Irreg> lirg;
                if (!dict.TryGetValue(iri.inflected, out lirg))
                    dict.Add(iri.inflected, lirg = new List<Irreg>());

                lirg.Add(new Irreg { stem = iri.stem, rule = rule });
            }
            return dict;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// Fields. Those marked [NonSerialized] are skipped by binary serialization; of these, Load() re-creates only
        /// 'g' and 'lex_lookup'.
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        [NonSerialized]
        Grammar g;

        public IEnumerable<LexicalEntry> lex_entries;

        // first lemma -> lexical entries
        [NonSerialized]
        ILookup<String, LexicalEntry> lex_lookup;

        // non-initial lemma of a multi-word entry -> (position, entry)
        public ILookup<String, NonInitialMwe> mwe_lookup;

        // inflected form -> stem/rule pairs; null when no irregs were supplied to the constructor
        [NonSerialized]
        public Dictionary<String, List<Irreg>> irreg_dict = null;

        [NonSerialized]
        public MorphologicalRule[] morph_lexrules;

        [NonSerialized]
        public LexicalRule[] non_morph_lexrules;

        public IEqualityComparer<String> text_comparer;

        /// <summary>The string comparer used as key comparer for the lemma lookups.</summary>
        public IEqualityComparer<String> TextComparer { get { return text_comparer; } }

        /// <summary>The grammar this lexicon was constructed or loaded with.</summary>
        public Grammar Grammar { get { return g; } }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// utility use: echo the input, but add any multi-word lexical entries to the provided list
        /// </summary>
        /// <remarks>
        /// This is a lazy iterator: 'mw' is only filled in as the returned sequence is enumerated.
        /// </remarks>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        IEnumerable<LexicalEntry> MultiWordFork(IEnumerable<LexicalEntry> input, List<LexicalEntry> mw)
        {
            foreach (LexicalEntry le in input)
            {
                yield return le;
                if (le.Lemmata.Count > 1)
                    mw.Add(le);
            }
        }

        /// <summary>
        /// Deserializes a Lexicon from the reader's underlying stream, then rebuilds the first-lemma lookup from
        /// 'le' and attaches the grammar 'g'.
        /// </summary>
        /// <param name="g">Grammar to attach to the loaded lexicon; stored as-is.</param>
        /// <param name="le">Lexical entries from which the first-lemma lookup is rebuilt.</param>
        /// <param name="br">Reader whose BaseStream is positioned at the serialized Lexicon.</param>
        /// <returns>The deserialized lexicon.</returns>
        /// <exception cref="ArgumentNullException">le or br is null.</exception>
        /// <remarks>
        /// The other [NonSerialized] fields (irreg_dict, morph_lexrules, non_morph_lexrules) are not assigned by
        /// this method, so they stay unset unless the caller assigns them afterwards.
        /// </remarks>
        public static Lexicon Load(Grammar g, IEnumerable<LexicalEntry> le, BinaryReader br)
        {
            if (le == null)
                throw new ArgumentNullException("le");
            if (br == null)
                throw new ArgumentNullException("br");

            // SECURITY: BinaryFormatter is unsafe on untrusted input (deserialization can execute arbitrary code)
            // and is obsolete in current .NET. Only feed this method streams produced by this application.
            BinaryFormatter bf = new BinaryFormatter();
            Lexicon l = (Lexicon)bf.Deserialize(br.BaseStream);

            l.lex_lookup = le.ToLookup(lex => lex.Lemmata[0], l.text_comparer);
            l.g = g;
            return l;
        }

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// ILookup(String, LexicalEntry) implementation follows; every member delegates to the first-lemma lookup
        /// </summary>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public bool Contains(String key)
        {
            return lex_lookup.Contains(key);
        }

        public int Count
        {
            get { return lex_lookup.Count; }
        }

        public IEnumerable<LexicalEntry> this[String key]
        {
            get { return lex_lookup[key]; }
        }

        public IEnumerator<IGrouping<String, LexicalEntry>> GetEnumerator()
        {
            return lex_lookup.GetEnumerator();
        }

        System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /// <summary>
    /// A lexical entry together with its orthography: one string per word, read from the entry's definition when
    /// the entry is constructed and cached in 'words'.
    /// </summary>
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    [DebuggerDisplay("{ToString(),nq}")]
    public partial class LexicalEntry : DemandExpandEntry
    {
        // Cached orthography; non-null and non-empty once the constructor has completed.
        public readonly String[] words = null;

        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        /// <summary>
        /// Note: LexicalEntries are not expanded for the purpose of extracting the orthography. It is obtained from the
        /// entry Definition, and thus in this design orthography cannot be unified-in as part of a grammar's type
        /// expansion.
        /// </summary>
        /// <exception cref="TfsException">
        /// The definition has no edge at the configured orthography path, or the value found there yields no strings.
        /// </exception>
        ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
        public LexicalEntry(Type t, String name, List<BaseFeatConstraint> bfc)
            : base(t, name, bfc)
        {
            // Extract the orthography and cache it
            Edge e;
            Tfs _def = BootstrapDefinition();
            if (tm.config.parser.OrthPath.GetEdge(_def, out e))
            {
                // A single string value is a one-word entry; otherwise read the value as a list of strings.
                String s = tm.GetStringValue(e.FlagsId);
                if (s != null)
                    words = new String[] { s };
                else
                    words = _def.GetListEdges(e).SelectNotNull(le => tm.GetStringValue(le.FlagsId)).ToArray();
            }

            // An empty array is as unusable as a missing one: Lexicon keys every entry on Lemmata[0], which would
            // fail with an index error much later and far from the offending entry. Reject it here instead.
            if (words == null || words.Length == 0)
                throw new TfsException("Putative lexical entry '{0}' does not have any orthography at the path '{1}'", Name, tm.config.grammar.orth_path);
        }

        /// <summary>The words of this entry's orthography, in order (the same array as 'words').</summary>
        public IList<String> Lemmata
        {
            get { return words; }
        }

        /// <summary>
        /// Formats the entry as its name and instance type name (each padded to 20 characters) followed by the
        /// bracketed words.
        /// </summary>
        public override string ToString()
        {
            return String.Format("{0} {1} {2}",
                Name.PadRight(20),
                InstanceType.Name.PadRight(20),
                words.Select(w => "[" + w + "]").StringJoin(" "));
        }
    }
}