using System; using System.Collections.Generic; using System.Diagnostics; using System.Linq; using miew.Enumerable; using miew.ReadOnly; namespace miew.Tokenization { using String = System.String; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// A general purpose contiguous span covering one or more "positions" which use zero-based indexing. /// Uses below distinguish character spans (spans of characters in a string) from token spans (spans of tokens /// in a sequence of tokens) /// This structure does not represent empty spans. /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// [DebuggerDisplay("{ToString(),nq}")] public struct Span : IEquatable<Span> { /// <summary> /// Initialize a Span with the specified inclusive start and ending index positions /// </summary> /// <param name="i_start">zero-based inclusive start index position</param> /// <param name="i_end">zero-based inclusive end index position</param> public Span(int i_start, int i_end) { if (i_end < i_start) throw new TypeInitializationException(typeof(Span).ToString(), new ArgumentException("End index must be greater than or equal to start index.")); if (i_start < 0) throw new TypeInitializationException(typeof(Span).ToString(), new ArgumentException("Start index must be non-negative.")); this.i_start = i_start; this.i_end = i_end; } /// <summary> /// Initialize a Span object with the specified staring index position and length /// </summary> /// <param name="i">zero-based starting and ending index position</param> public Span(int i_start, uint length) : this(i_start, i_start + (short)length - 1) { } int i_start, i_end; /// <summary> /// Spans have a minimum length of 1 /// </summary> public int Length { get { return i_end - i_start + 1; } } /// <summary> /// Inclusive zero-based starting index /// </summary> public int StartIndex { get { return i_start; } } /// <summary> /// Inclusive zero-based ending index, could be the same as the start index. /// </summary> public int EndIndex { get { return i_end; } } public override int GetHashCode() { return i_end ^ (i_start << 16); } public override bool Equals(object obj) { return obj is Span && Equals((Span)obj); } public bool Equals(Span other) { return i_start == other.i_start && i_end == other.i_end; } public static bool operator ==(Span x, Span y) { return x.Equals(y); } public static bool operator !=(Span x, Span y) { return !x.Equals(y); } public static Span operator +(Span x, Span y) { if (x.i_end + 1 == y.i_start) return new Span(x.StartIndex, y.EndIndex); if (y.i_end + 1 == x.i_start) return new Span(y.StartIndex, x.EndIndex); throw new InvalidOperationException(); } public bool Overlaps(Span other) { return (other.i_start <= this.i_end && this.i_end <= other.i_end) || (this.i_start <= other.i_end && other.i_end <= this.i_end); } public static Span OuterSpan(params Span[] rgsp) { int i_min = int.MaxValue; int i_max = int.MinValue; foreach (Span sp in rgsp) { if (sp.i_start < i_min) i_min = sp.i_start; if (sp.i_end > i_max) i_max = sp.i_end; } return new Span(i_min, i_max); } /// <summary> /// The range of indexes used by the Span /// </summary> public IEnumerable<int> Range { get { return System.Linq.Enumerable.Range(i_start, Length); } } public override string ToString() { return String.Format("[{0} {1}]", i_start, i_end); } public static IEnumerable<Span> FromIndexList(IEnumerable<int> list) { var ie = list.OrderByDistinct().GetEnumerator(); if (!ie.MoveNext()) yield break; int i_start = ie.Current; int i_prev = i_start; while (ie.MoveNext()) { int i = ie.Current; if (i != i_prev + 1) { yield return new Span(i_start, i_prev); i_start = i; } i_prev = i; } yield return new Span(i_start, i_prev); } }; public interface ICharacterSpan { Span CharacterSpan { get; } String Text { get; } }; public interface ITokenSpan : ICharacterSpan { Span TokenSpan { get; } }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// ITokenizers return their results as a set of TokenizerTokens /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public struct CharacterSpanToken : IEquatable<CharacterSpanToken>, ICharacterSpan { public CharacterSpanToken(int i_start, int i_end) { this.span = new Span(i_start, i_end); this.modified = null; } public CharacterSpanToken(Span span, String modified) { if (modified != null && modified.Length != span.Length) Debug.Print("Warning: The replacement text {0} contains a different number of characters than the span it replaces in the source {1}.", modified, span); this.span = span; this.modified = modified; } public CharacterSpanToken(Span span) { this.span = span; this.modified = null; } /// <summary> /// Character span in the source text /// </summary> Span span; public Span CharacterSpan { get { return span; } } /// <summary> /// A tokenizer can return replacement text which will be associated with the parse token. If this is /// null, the text is retrieved according to the span in the source text. /// </summary> String modified; public String Text { get { return modified; } } /// <summary> /// helper function converts an array of strings to a sequence of abutting character spans. /// </summary> static public IEnumerable<ICharacterSpan> GetStringArraySpans(IEnumerable<String> word_array, int cch_sep) { int i = 0; foreach (String w in word_array) { if (w.Length > 0) yield return new CharacterSpanToken(i, i + w.Length - 1); i += w.Length + cch_sep; } } public bool Equals(CharacterSpanToken other) { return span.Equals(other.span) && modified == other.modified; } public override bool Equals(object obj) { throw new Exception("--> use strongly typed IEquatable<T> instead"); } public override int GetHashCode() { return (modified == null ? 0 : modified.GetHashCode()) ^ span.GetHashCode(); } public override string ToString() { return String.Format("{0} [{1}]", span, modified); } }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// [DebuggerDisplay("{ToString(),nq}")] public struct TokenSpanToken : ITokenSpan, IEquatable<TokenSpanToken>, IEnumerable<TokenSpanToken> { public TokenSpanToken(TokenSet ts, String s_subst, Span chr_span, Span tok_span) { this.ts = ts; this.chr_span = chr_span; this.tok_span = tok_span; this.s_subst = s_subst; } readonly TokenSet ts; readonly Span chr_span; readonly Span tok_span; readonly String s_subst; /// <summary> /// The span of characters in the source string that is covered by this token /// </summary> public Span CharacterSpan { get { return chr_span; } } /// <summary> /// The span of minimal token spans (e.g. chart positions) that is covered by this token /// </summary> public Span TokenSpan { get { return tok_span; } } /// <summary> /// The text of this token /// </summary> public String Text { get { return s_subst ?? ts.Source.Text.Substring(chr_span.StartIndex, chr_span.Length); } } /// <summary> /// Owner of this token /// </summary> public TokenSet Owner { get { return ts; } } /// <summary> /// True if this is an initial token in the source string. /// </summary> public bool IsFirst { get { return this.tok_span.StartIndex == 0; } } /// <summary> /// True if this is a final token in the source string. /// </summary> public bool IsLast { get { return this.tok_span.EndIndex == ts.MinimalSpanCount - 1; } } /// <summary> /// Enumerates all tokens in this token's TokenizedString which are strictly right-adjacent to this token. /// </summary> public IEnumerable<TokenSpanToken> RightAbuttingTokens() { int ix = this.tok_span.EndIndex + 1; foreach (TokenSpanToken tok in ts) if (tok.TokenSpan.StartIndex == ix) yield return tok; } /// <summary> /// Enumerates all possible sequences of strictly adjacent tokens to the right of this token /// </summary> public IEnumerable<IEnumerable<TokenSpanToken>> RightAdjacencySequences() { foreach (TokenSpanToken tok in RightAbuttingTokens()) { bool f_any = false; foreach (IEnumerable<TokenSpanToken> adj in tok.RightAdjacencySequences()) { yield return adj.Prepend(tok); f_any = true; } if (!f_any) yield return tok; } } /// <summary> /// Enumerates all tokens in this token's TokenizedString which are strictly left-adjacent to this token. /// </summary> public IEnumerable<TokenSpanToken> LeftAbuttingTokens() { int ix = this.tok_span.StartIndex - 1; foreach (TokenSpanToken tok in ts) if (tok.TokenSpan.EndIndex == ix) yield return tok; } /// <summary> /// Enumerates all possible sequences of strictly adjacent tokens to the left of this token /// </summary> public IEnumerable<IEnumerable<TokenSpanToken>> LeftAdjacencySequences() { foreach (TokenSpanToken tok in LeftAbuttingTokens()) { bool f_any = false; foreach (IEnumerable<TokenSpanToken> adj in tok.LeftAdjacencySequences()) { yield return adj.Append(tok); f_any = true; } if (!f_any) yield return tok; } } public override string ToString() { return String.Format("{0} {1} {2}", CharacterSpan, TokenSpan, Text); } public bool Equals(TokenSpanToken other) { return this.ts == other.ts && this.chr_span == other.chr_span && this.tok_span == other.tok_span; } public override bool Equals(object obj) { return obj is TokenSpanToken && this.Equals((TokenSpanToken)obj); } public override int GetHashCode() { return ts.GetHashCode() ^ (chr_span.GetHashCode() - tok_span.GetHashCode()); } IEnumerator<TokenSpanToken> IEnumerable<TokenSpanToken>.GetEnumerator() { yield return this; } System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { yield return this; } }; //public struct Position //{ // public Position(TokenSet ts, int index) // { // this.ts = ts; // this.index = index; // } // public int index; // TokenSet ts; //} /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// Interface for a tokenizer which is able to report multiple arbitrarily overlapping tokenization hypotheses. The /// design currently assumes that each non-overlapping permutation that does not exclude any unblocked tokens is a /// distinct hypothesis. Under such a scheme, it is possible for some hypotheses to not consume some character /// positions which are used in other hypotheses. /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public interface ITokenizer { IEnumerable<ICharacterSpan> Tokenize(String s_input); }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// A simple tokenizer which splits text at the position of every instance of the (specified) characters /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public class SplitTokenizer : ITokenizer { readonly Char[] sep_chars; public SplitTokenizer(Char[] sep_chars) { this.sep_chars = sep_chars; } public IEnumerable<ICharacterSpan> Tokenize(string s_input) { int ix, ix_prev = 0; while ((ix = s_input.IndexOfAny(sep_chars, ix_prev)) != -1) { if (ix > ix_prev) yield return new CharacterSpanToken(ix_prev, ix - 1); ix_prev = ix + 1; } if (s_input.Length > ix_prev) yield return new CharacterSpanToken(ix_prev, s_input.Length - 1); } }; public class SpaceCharTokenizer : SplitTokenizer { readonly static Char[] one_space = new Char[] { ' ' }; public SpaceCharTokenizer() : base(one_space) { } }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// Abstract base class for TokenizedString, which represents a string and its arbitrary collection of tokens, /// and TokenizationHypothesis, which represents a covering tokenization solution. /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public abstract class TokenSet : IReadOnlyCollection<TokenSpanToken> { protected readonly TokenSpanToken[] tokens; /// <summary> /// bypass the minimal coverage calculation and thus allow recording token selections for a covering subset /// </summary> protected TokenSet(IEnumerable<TokenSpanToken> tokenset_tokens) { this.tokens = tokenset_tokens as TokenSpanToken[] ?? tokenset_tokens.ToArray(); } /// <summary> /// Construct a derived object for the given string and array of character spans. This constructor is /// intended for internal use. /// </summary> protected TokenSet(IEnumerable<ICharacterSpan> tokenizer_tokens) { /// each character span designates a token. Any number of tokens may overlap or leave gaps in the string /// in an arbitrary fashion. Uncovered gaps are ignored by the minimal token span that is computed, making /// them appropriate for representing whitespace. ICharacterSpan[] rg_tok = tokenizer_tokens as ICharacterSpan[] ?? tokenizer_tokens.ToArray(); /// set union of character indexes used by the reported tokens, used to eliminate vacant parts in the next step HashSet<int> hs = rg_tok.UnionMany(t => t.CharacterSpan.Range); /// create a map of source character indexes to minimal token indexes Dictionary<int, int> map = rg_tok .Select(sp => sp.CharacterSpan.StartIndex) // integer character start indexes .Union(rg_tok.Select(sp => sp.CharacterSpan.EndIndex + 1)) // union with starts implied by ends .Sort() // sort all pseudostart indexes .ReduceAdjacent((i_prev, i_cur) => new Span(i_prev, i_cur - 1)) // find minimal parts .Where(sp => hs.Contains(sp.StartIndex)) // eliminate vacant parts .SelectMany((sp, ix_tok) => sp.Range.Select(ix_char => new { ix_char, ix_tok })) // project new indexes .ToDictionary(a => a.ix_char, a => a.ix_tok); /// create tokens with source character span and remapped minimal token span tokens = rg_tok .Select(cs => new TokenSpanToken(this, cs.Text, cs.CharacterSpan, new Span(map[cs.CharacterSpan.StartIndex], map[cs.CharacterSpan.EndIndex]))) .ToArray(); } public abstract TokenizedString Source { get; } public abstract bool IsFullyCovered { get; } /// <summary> /// Returns true if any tokens overlap each other /// </summary> public bool AnyOverlaps { get { if (tokens.Length <= 1) return false; for (int i = 1; i < tokens.Length - 1; i++) if (tokens[i - 1].CharacterSpan.Overlaps(tokens[i].CharacterSpan)) return true; return false; } } /// <summary> /// The set union of all character indexes (in the source string) which are referenced by the tokens /// </summary> public HashSet<int> UsedCharacterIndexes { get { return tokens.UnionMany(t => t.CharacterSpan.Range); } } /// <summary> /// The set union of all token indexes (in the minimal token spans) which are referenced by the /// tokens. By the construction of 'TokenizationString' objects, this set should always be /// uninterestingly equal to the range of integers from 0...MinimalSpanCount-1. /// For 'TokenizationHypothesis' objects, which may have selected a subset of those tokens, /// the range may have gaps which represent missing coverage vis-a-vis the originating /// 'TokenizationString' /// </summary> public HashSet<int> UsedTokenIndexes { get { return tokens.UnionMany(t => t.TokenSpan.Range); } } /// <summary> /// The minimal set of non-overlapping (and possibly disjoint) character spans which accomodate /// the reported source tokens **excluding gaps** /// calculated in constructor now and saved with the tokens /// This function is not intended for main-line use, as it reconstructs the analysis which is now /// performed in the constructor /// </summary> public IEnumerable<Span> MinimalCharacterSpans { get { var hs = UsedCharacterIndexes; return CharacterBoundarySpans.Where(i => hs.Contains(i.StartIndex)); } } /// <summary> /// The minimal set of non-overlapping (and possibly disjoint) character spans which accomodate /// the reported tokens **plus gaps between them** /// This function is not intended for main-line use, as it reconstructs the analysis which is now /// performed in the constructor /// </summary> public IEnumerable<Span> CharacterBoundarySpans { get { return tokens .Select(t => t.CharacterSpan.StartIndex) .Union(tokens.Select(t2 => t2.CharacterSpan.EndIndex + 1)) .Sort() .ReduceAdjacent((i_prev, i_cur) => new Span(i_prev, i_cur - 1)); } } /// <summary> /// Number of spans referenced by the minimal token spans. This is the minimal number of chart positions needed /// to represent all of the string's token boundaries /// </summary> public int MinimalSpanCount { get { return tokens.Max(t => t.TokenSpan.EndIndex) + 1; } } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// IReadOnlyList(Token) implementation /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public bool Contains(TokenSpanToken item) { return System.Array.IndexOf<TokenSpanToken>(tokens, item) != -1; } public int IndexOf(TokenSpanToken item) { return System.Array.IndexOf<TokenSpanToken>(tokens, item); } public TokenSpanToken this[int ix] { get { return tokens[ix]; } } public void CopyTo(TokenSpanToken[] array, int arrayIndex) { foreach (TokenSpanToken t in this) array[arrayIndex++] = t; } /// <summary> /// This is the number of tokens, not the number of chart positions required for them, even though they are /// frequently equal. The latter is given by MinimalCountSpan. /// </summary> public int Count { get { return tokens.Length; } } public IEnumerator<TokenSpanToken> GetEnumerator() { return tokens.Cast<TokenSpanToken>().GetEnumerator(); } System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { return GetEnumerator(); } }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// A string which carries information about multiple tokenization hypotheses. Tokens can overlap, and/or /// there can be gaps in the coverage of the surface string. /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public class TokenizedString : TokenSet { readonly String m_string; static IEnumerable<ICharacterSpan> _prepare_char_spans(String s_input, IEnumerable<ICharacterSpan> iecs) { return iecs.Distinct() .Select(tt => { if (tt.Text == null) { String s_subst = s_input.Substring(tt.CharacterSpan.StartIndex, tt.CharacterSpan.Length); tt = new CharacterSpanToken(tt.CharacterSpan, s_subst); } return tt as ICharacterSpan; }) .OrderBy(sp => sp.CharacterSpan.StartIndex) .ThenBy(sp => sp.CharacterSpan.EndIndex); } public TokenizedString(String s_input, IEnumerable<ICharacterSpan> iecs) : base(_prepare_char_spans(s_input, iecs)) { /// The source string is never modified from its original form but individual tokens can manifest /// substitution text m_string = s_input; } /// <summary> /// Initialize a TokenizedString object by tokenizing the given string with the specified tokenizer /// </summary> public TokenizedString(String s_input, ITokenizer itok) : this(s_input, itok.Tokenize(s_input)) { } /// <summary> /// Initialize a TokenizedString object from an array of strings. The tokens are accepted as-is, but /// the surface representation incorporates the specified string as a seprarator between each string. /// </summary> public TokenizedString(IEnumerable<String> word_array, String separator = null) : base(CharacterSpanToken.GetStringArraySpans(word_array, (separator ?? String.Empty).Length)) { m_string = String.Join(separator, word_array); } /// <summary> /// For when the ISpanToken.Text is available, the source string is built from them /// </summary> /// <param name="i_span_toks"></param> public TokenizedString(IEnumerable<ICharacterSpan> i_span_toks, String separator = null) : base(i_span_toks) { m_string = String.Join(separator, i_span_toks.Select(ist => ist.Text)); } /// <summary> /// The original, unmodified text of the source string. /// </summary> public String Text { get { return m_string; } } public override TokenizedString Source { get { return this; } } /// <summary> /// Returns true if the tokens leave any uncovered gaps in the source string. /// </summary> public bool AnyCharacterGaps { get { return UsedCharacterIndexes.Count != Text.Length; } } /// <summary> /// All objects of type 'TokenizedString' have fully covering token spans by construction, because the /// total coverage of the tokens that are incorporated when the object is built is how the contentful /// regions are permanently defined. Subsets of these tokens--which can manifest partial coverage in /// relation to an originating 'TokenizedString'--are represented with the 'TokenizationHypothesis' /// sibling object. /// </summary> public override bool IsFullyCovered { get { return true; } } /// <summary> /// Permits access to the text of an arbitrary span in the source string. For one thing, this allows one /// to examine the contents of the gap portions, if desired. /// </summary> public String SpanText(Span sp) { return m_string.Substring(sp.StartIndex, sp.Length); } /// <summary> /// This function is not intended for main-line use, as it reconstructs the analysis which is now /// performed in the constructor /// </summary> public Span MinimalSpanToCharacterSpan(Span sp_min) { return new Span(MinimalCharacterSpans.ElementAt(sp_min.StartIndex).StartIndex, MinimalCharacterSpans.ElementAt(sp_min.EndIndex).EndIndex); } public String MinimalSpanText(Span sp_min) { Span sp_chr = MinimalSpanToCharacterSpan(sp_min); return m_string.Substring(sp_chr.StartIndex, sp_chr.Length); } /// <summary> /// Returns those token sequences which fully cover the minimal token spans, if any, as individual single- /// coverage hypotheses. Naturally some of these distinct sequences may re-use the same tokens; what this /// function tries to guarantee is that each returned solution has no overlap. If one wishes to allow the /// parser (for example) to take control of token selection, then one can just process the TokenizedString /// to the parser instead, since such tools are expected to rely only on the common base class 'TokenSet' /// </summary> public IEnumerable<TokenizationHypothesis> Hypotheses { get { if (tokens.Length == 0) yield break; int ix_last = MinimalSpanCount - 1; foreach (var s in tokens[0].RightAdjacencySequences().Where(seq => seq.Last().TokenSpan.EndIndex == ix_last)) yield return new TokenizationHypothesis(this, s); } } public override string ToString() { return this.Select(tok => String.Format("{{{0}}}{1}", tok.Text, tok.CharacterSpan.ToString())).StringJoin(" "); } }; /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// <summary> /// As discussed above, TokenizedString has the ability to winnow all single coverage tokenization solutions from /// a set of arbitrarily covering tokens (if any). These solutions are kept in this separate wrapper in order to /// record their provenance. /// </summary> /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// public class TokenizationHypothesis : TokenSet { TokenizedString ts; public TokenizationHypothesis(TokenizedString ts, IEnumerable<TokenSpanToken> tokens) : base(tokens) { this.ts = ts; } public override TokenizedString Source { get { return ts; } } /// <summary> /// Returns true if this hypothesis fully covers all minimal span positions of the tokenization /// superset (i.e. 'TokenizedString') from which this TokenizationHypothesis was drawn. /// </summary> public override bool IsFullyCovered { get { //return this.UsedTokenIndexes.Count == MinimalSpanCount - 1; return this.UsedCharacterIndexes.Count == ts.UsedCharacterIndexes.Count; } } }; #if false public class TestTokenizer2 : ITokenizer { public IEnumerable<TokenizerToken> Tokenize(string s_input) { yield return new TokenizerToken(7, 9); yield return new TokenizerToken(2, 4); yield return new TokenizerToken(2, 6); yield return new TokenizerToken(5, 5); yield return new TokenizerToken(6, 6); yield return new TokenizerToken(7, 10); yield return new TokenizerToken(7, 8); yield return new TokenizerToken(8, 8); yield return new TokenizerToken(9, 9); yield return new TokenizerToken(1, 1); yield return new TokenizerToken(0, 3); yield return new TokenizerToken(9, 11); yield return new TokenizerToken(4, 4); } } static void rpt(TokenizedString ts) { Console.WriteLine(@"input: [{0}] Tokens: {1} any gaps: {2} any overlaps: {3} Minimal Span Count: {4} Minimal Character Spans: {5} Minimal Spans Including Vacant: {6} ", ts.Text, ts.Select(t => { String ttt = new String(' ', t.CharacterSpan.StartIndex) + t.Text + new String(' ', ts.Text.Length - t.CharacterSpan.EndIndex - 1); return String.Format("\t{0}\t{1}\t[{2}]", t.CharacterSpan, t.TokenSpan, ttt); }).StringJoin(Environment.NewLine), ts.AnyCharacterGaps, ts.AnyOverlaps, ts.MinimalSpanCount, ts.MinimalCharacterSpans.Select(t => String.Format("\t{0}\t\\{1}\\", t, ts.SpanText(t))).StringJoin(Environment.NewLine), ts.CharacterBoundarySpans.Select(t => String.Format("\t{0}\t\\{1}\\", t, ts.SpanText(t))).StringJoin(Environment.NewLine)); Console.WriteLine(); } #endif }