using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using miew.Enumerable;
using miew.ReadOnly;
namespace miew.Tokenization
{
using String = System.String;
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// A general purpose contiguous span covering one or more "positions" which use zero-based indexing.
/// Uses below distinguish character spans (spans of characters in a string) from token spans (spans of tokens
/// in a sequence of tokens)
/// This structure does not represent empty spans.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{ToString(),nq}")]
public struct Span : IEquatable<Span>
{
/// <summary>
/// Initialize a Span with the specified inclusive start and ending index positions
/// </summary>
/// <param name="i_start">zero-based inclusive start index position</param>
/// <param name="i_end">zero-based inclusive end index position</param>
public Span(int i_start, int i_end)
{
if (i_end < i_start)
throw new TypeInitializationException(typeof(Span).ToString(), new ArgumentException("End index must be greater than or equal to start index."));
if (i_start < 0)
throw new TypeInitializationException(typeof(Span).ToString(), new ArgumentException("Start index must be non-negative."));
this.i_start = i_start;
this.i_end = i_end;
}
/// <summary>
/// Initialize a Span object with the specified staring index position and length
/// </summary>
/// <param name="i">zero-based starting and ending index position</param>
public Span(int i_start, uint length)
: this(i_start, i_start + (short)length - 1)
{
}
int i_start, i_end;
/// <summary>
/// Spans have a minimum length of 1
/// </summary>
public int Length { get { return i_end - i_start + 1; } }
/// <summary>
/// Inclusive zero-based starting index
/// </summary>
public int StartIndex { get { return i_start; } }
/// <summary>
/// Inclusive zero-based ending index, could be the same as the start index.
/// </summary>
public int EndIndex { get { return i_end; } }
public override int GetHashCode() { return i_end ^ (i_start << 16); }
public override bool Equals(object obj)
{
return obj is Span && Equals((Span)obj);
}
public bool Equals(Span other)
{
return i_start == other.i_start && i_end == other.i_end;
}
public static bool operator ==(Span x, Span y) { return x.Equals(y); }
public static bool operator !=(Span x, Span y) { return !x.Equals(y); }
public static Span operator +(Span x, Span y)
{
if (x.i_end + 1 == y.i_start)
return new Span(x.StartIndex, y.EndIndex);
if (y.i_end + 1 == x.i_start)
return new Span(y.StartIndex, x.EndIndex);
throw new InvalidOperationException();
}
public bool Overlaps(Span other)
{
return (other.i_start <= this.i_end && this.i_end <= other.i_end) ||
(this.i_start <= other.i_end && other.i_end <= this.i_end);
}
public static Span OuterSpan(params Span[] rgsp)
{
int i_min = int.MaxValue;
int i_max = int.MinValue;
foreach (Span sp in rgsp)
{
if (sp.i_start < i_min)
i_min = sp.i_start;
if (sp.i_end > i_max)
i_max = sp.i_end;
}
return new Span(i_min, i_max);
}
/// <summary>
/// The range of indexes used by the Span
/// </summary>
public IEnumerable<int> Range
{
get { return System.Linq.Enumerable.Range(i_start, Length); }
}
public override string ToString()
{
return String.Format("[{0} {1}]", i_start, i_end);
}
public static IEnumerable<Span> FromIndexList(IEnumerable<int> list)
{
var ie = list.OrderByDistinct().GetEnumerator();
if (!ie.MoveNext())
yield break;
int i_start = ie.Current;
int i_prev = i_start;
while (ie.MoveNext())
{
int i = ie.Current;
if (i != i_prev + 1)
{
yield return new Span(i_start, i_prev);
i_start = i;
}
i_prev = i;
}
yield return new Span(i_start, i_prev);
}
};
public interface ICharacterSpan
{
Span CharacterSpan { get; }
String Text { get; }
};
public interface ITokenSpan : ICharacterSpan
{
Span TokenSpan { get; }
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// ITokenizers return their results as a set of TokenizerTokens
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public struct CharacterSpanToken : IEquatable<CharacterSpanToken>, ICharacterSpan
{
public CharacterSpanToken(int i_start, int i_end)
{
this.span = new Span(i_start, i_end);
this.modified = null;
}
public CharacterSpanToken(Span span, String modified)
{
if (modified != null && modified.Length != span.Length)
Debug.Print("Warning: The replacement text {0} contains a different number of characters than the span it replaces in the source {1}.", modified, span);
this.span = span;
this.modified = modified;
}
public CharacterSpanToken(Span span)
{
this.span = span;
this.modified = null;
}
/// <summary>
/// Character span in the source text
/// </summary>
Span span;
public Span CharacterSpan { get { return span; } }
/// <summary>
/// A tokenizer can return replacement text which will be associated with the parse token. If this is
/// null, the text is retrieved according to the span in the source text.
/// </summary>
String modified;
public String Text { get { return modified; } }
/// <summary>
/// helper function converts an array of strings to a sequence of abutting character spans.
/// </summary>
static public IEnumerable<ICharacterSpan> GetStringArraySpans(IEnumerable<String> word_array, int cch_sep)
{
int i = 0;
foreach (String w in word_array)
{
if (w.Length > 0)
yield return new CharacterSpanToken(i, i + w.Length - 1);
i += w.Length + cch_sep;
}
}
public bool Equals(CharacterSpanToken other)
{
return span.Equals(other.span) && modified == other.modified;
}
public override bool Equals(object obj)
{
throw new Exception("--> use strongly typed IEquatable<T> instead");
}
public override int GetHashCode()
{
return (modified == null ? 0 : modified.GetHashCode()) ^ span.GetHashCode();
}
public override string ToString()
{
return String.Format("{0} [{1}]", span, modified);
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
///
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
[DebuggerDisplay("{ToString(),nq}")]
public struct TokenSpanToken : ITokenSpan, IEquatable<TokenSpanToken>, IEnumerable<TokenSpanToken>
{
public TokenSpanToken(TokenSet ts, String s_subst, Span chr_span, Span tok_span)
{
this.ts = ts;
this.chr_span = chr_span;
this.tok_span = tok_span;
this.s_subst = s_subst;
}
readonly TokenSet ts;
readonly Span chr_span;
readonly Span tok_span;
readonly String s_subst;
/// <summary>
/// The span of characters in the source string that is covered by this token
/// </summary>
public Span CharacterSpan
{
get { return chr_span; }
}
/// <summary>
/// The span of minimal token spans (e.g. chart positions) that is covered by this token
/// </summary>
public Span TokenSpan
{
get { return tok_span; }
}
/// <summary>
/// The text of this token
/// </summary>
public String Text
{
get { return s_subst ?? ts.Source.Text.Substring(chr_span.StartIndex, chr_span.Length); }
}
/// <summary>
/// Owner of this token
/// </summary>
public TokenSet Owner { get { return ts; } }
/// <summary>
/// True if this is an initial token in the source string.
/// </summary>
public bool IsFirst { get { return this.tok_span.StartIndex == 0; } }
/// <summary>
/// True if this is a final token in the source string.
/// </summary>
public bool IsLast { get { return this.tok_span.EndIndex == ts.MinimalSpanCount - 1; } }
/// <summary>
/// Enumerates all tokens in this token's TokenizedString which are strictly right-adjacent to this token.
/// </summary>
public IEnumerable<TokenSpanToken> RightAbuttingTokens()
{
int ix = this.tok_span.EndIndex + 1;
foreach (TokenSpanToken tok in ts)
if (tok.TokenSpan.StartIndex == ix)
yield return tok;
}
/// <summary>
/// Enumerates all possible sequences of strictly adjacent tokens to the right of this token
/// </summary>
public IEnumerable<IEnumerable<TokenSpanToken>> RightAdjacencySequences()
{
foreach (TokenSpanToken tok in RightAbuttingTokens())
{
bool f_any = false;
foreach (IEnumerable<TokenSpanToken> adj in tok.RightAdjacencySequences())
{
yield return adj.Prepend(tok);
f_any = true;
}
if (!f_any)
yield return tok;
}
}
/// <summary>
/// Enumerates all tokens in this token's TokenizedString which are strictly left-adjacent to this token.
/// </summary>
public IEnumerable<TokenSpanToken> LeftAbuttingTokens()
{
int ix = this.tok_span.StartIndex - 1;
foreach (TokenSpanToken tok in ts)
if (tok.TokenSpan.EndIndex == ix)
yield return tok;
}
/// <summary>
/// Enumerates all possible sequences of strictly adjacent tokens to the left of this token
/// </summary>
public IEnumerable<IEnumerable<TokenSpanToken>> LeftAdjacencySequences()
{
foreach (TokenSpanToken tok in LeftAbuttingTokens())
{
bool f_any = false;
foreach (IEnumerable<TokenSpanToken> adj in tok.LeftAdjacencySequences())
{
yield return adj.Append(tok);
f_any = true;
}
if (!f_any)
yield return tok;
}
}
public override string ToString()
{
return String.Format("{0} {1} {2}", CharacterSpan, TokenSpan, Text);
}
public bool Equals(TokenSpanToken other)
{
return this.ts == other.ts && this.chr_span == other.chr_span && this.tok_span == other.tok_span;
}
public override bool Equals(object obj)
{
return obj is TokenSpanToken && this.Equals((TokenSpanToken)obj);
}
public override int GetHashCode()
{
return ts.GetHashCode() ^ (chr_span.GetHashCode() - tok_span.GetHashCode());
}
IEnumerator<TokenSpanToken> IEnumerable<TokenSpanToken>.GetEnumerator() { yield return this; }
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() { yield return this; }
};
//public struct Position
//{
// public Position(TokenSet ts, int index)
// {
// this.ts = ts;
// this.index = index;
// }
// public int index;
// TokenSet ts;
//}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Interface for a tokenizer which is able to report multiple arbitrarily overlapping tokenization hypotheses. The
/// design currently assumes that each non-overlapping permutation that does not exclude any unblocked tokens is a
/// distinct hypothesis. Under such a scheme, it is possible for some hypotheses to not consume some character
/// positions which are used in other hypotheses.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public interface ITokenizer
{
IEnumerable<ICharacterSpan> Tokenize(String s_input);
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// A simple tokenizer which splits text at the position of every instance of the (specified) characters
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class SplitTokenizer : ITokenizer
{
readonly Char[] sep_chars;
public SplitTokenizer(Char[] sep_chars)
{
this.sep_chars = sep_chars;
}
public IEnumerable<ICharacterSpan> Tokenize(string s_input)
{
int ix, ix_prev = 0;
while ((ix = s_input.IndexOfAny(sep_chars, ix_prev)) != -1)
{
if (ix > ix_prev)
yield return new CharacterSpanToken(ix_prev, ix - 1);
ix_prev = ix + 1;
}
if (s_input.Length > ix_prev)
yield return new CharacterSpanToken(ix_prev, s_input.Length - 1);
}
};
public class SpaceCharTokenizer : SplitTokenizer
{
readonly static Char[] one_space = new Char[] { ' ' };
public SpaceCharTokenizer()
: base(one_space)
{
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// Abstract base class for TokenizedString, which represents a string and its arbitrary collection of tokens,
/// and TokenizationHypothesis, which represents a covering tokenization solution.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public abstract class TokenSet : IReadOnlyCollection<TokenSpanToken>
{
protected readonly TokenSpanToken[] tokens;
/// <summary>
/// bypass the minimal coverage calculation and thus allow recording token selections for a covering subset
/// </summary>
protected TokenSet(IEnumerable<TokenSpanToken> tokenset_tokens)
{
this.tokens = tokenset_tokens as TokenSpanToken[] ?? tokenset_tokens.ToArray();
}
/// <summary>
/// Construct a derived object for the given string and array of character spans. This constructor is
/// intended for internal use.
/// </summary>
protected TokenSet(IEnumerable<ICharacterSpan> tokenizer_tokens)
{
/// each character span designates a token. Any number of tokens may overlap or leave gaps in the string
/// in an arbitrary fashion. Uncovered gaps are ignored by the minimal token span that is computed, making
/// them appropriate for representing whitespace.
ICharacterSpan[] rg_tok = tokenizer_tokens as ICharacterSpan[] ?? tokenizer_tokens.ToArray();
/// set union of character indexes used by the reported tokens, used to eliminate vacant parts in the next step
HashSet<int> hs = rg_tok.UnionMany(t => t.CharacterSpan.Range);
/// create a map of source character indexes to minimal token indexes
Dictionary<int, int> map = rg_tok
.Select(sp => sp.CharacterSpan.StartIndex) // integer character start indexes
.Union(rg_tok.Select(sp => sp.CharacterSpan.EndIndex + 1)) // union with starts implied by ends
.Sort() // sort all pseudostart indexes
.ReduceAdjacent((i_prev, i_cur) => new Span(i_prev, i_cur - 1)) // find minimal parts
.Where(sp => hs.Contains(sp.StartIndex)) // eliminate vacant parts
.SelectMany((sp, ix_tok) => sp.Range.Select(ix_char => new { ix_char, ix_tok })) // project new indexes
.ToDictionary(a => a.ix_char, a => a.ix_tok);
/// create tokens with source character span and remapped minimal token span
tokens = rg_tok
.Select(cs => new TokenSpanToken(this, cs.Text, cs.CharacterSpan, new Span(map[cs.CharacterSpan.StartIndex], map[cs.CharacterSpan.EndIndex])))
.ToArray();
}
public abstract TokenizedString Source { get; }
public abstract bool IsFullyCovered { get; }
/// <summary>
/// Returns true if any tokens overlap each other
/// </summary>
public bool AnyOverlaps
{
get
{
if (tokens.Length <= 1)
return false;
for (int i = 1; i < tokens.Length - 1; i++)
if (tokens[i - 1].CharacterSpan.Overlaps(tokens[i].CharacterSpan))
return true;
return false;
}
}
/// <summary>
/// The set union of all character indexes (in the source string) which are referenced by the tokens
/// </summary>
public HashSet<int> UsedCharacterIndexes
{
get { return tokens.UnionMany(t => t.CharacterSpan.Range); }
}
/// <summary>
/// The set union of all token indexes (in the minimal token spans) which are referenced by the
/// tokens. By the construction of 'TokenizationString' objects, this set should always be
/// uninterestingly equal to the range of integers from 0...MinimalSpanCount-1.
/// For 'TokenizationHypothesis' objects, which may have selected a subset of those tokens,
/// the range may have gaps which represent missing coverage vis-a-vis the originating
/// 'TokenizationString'
/// </summary>
public HashSet<int> UsedTokenIndexes
{
get { return tokens.UnionMany(t => t.TokenSpan.Range); }
}
/// <summary>
/// The minimal set of non-overlapping (and possibly disjoint) character spans which accomodate
/// the reported source tokens **excluding gaps**
/// calculated in constructor now and saved with the tokens
/// This function is not intended for main-line use, as it reconstructs the analysis which is now
/// performed in the constructor
/// </summary>
public IEnumerable<Span> MinimalCharacterSpans
{
get
{
var hs = UsedCharacterIndexes;
return CharacterBoundarySpans.Where(i => hs.Contains(i.StartIndex));
}
}
/// <summary>
/// The minimal set of non-overlapping (and possibly disjoint) character spans which accomodate
/// the reported tokens **plus gaps between them**
/// This function is not intended for main-line use, as it reconstructs the analysis which is now
/// performed in the constructor
/// </summary>
public IEnumerable<Span> CharacterBoundarySpans
{
get
{
return tokens
.Select(t => t.CharacterSpan.StartIndex)
.Union(tokens.Select(t2 => t2.CharacterSpan.EndIndex + 1))
.Sort()
.ReduceAdjacent((i_prev, i_cur) => new Span(i_prev, i_cur - 1));
}
}
/// <summary>
/// Number of spans referenced by the minimal token spans. This is the minimal number of chart positions needed
/// to represent all of the string's token boundaries
/// </summary>
public int MinimalSpanCount { get { return tokens.Max(t => t.TokenSpan.EndIndex) + 1; } }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// IReadOnlyList(Token) implementation
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public bool Contains(TokenSpanToken item)
{
return System.Array.IndexOf<TokenSpanToken>(tokens, item) != -1;
}
public int IndexOf(TokenSpanToken item)
{
return System.Array.IndexOf<TokenSpanToken>(tokens, item);
}
public TokenSpanToken this[int ix]
{
get { return tokens[ix]; }
}
public void CopyTo(TokenSpanToken[] array, int arrayIndex)
{
foreach (TokenSpanToken t in this)
array[arrayIndex++] = t;
}
/// <summary>
/// This is the number of tokens, not the number of chart positions required for them, even though they are
/// frequently equal. The latter is given by MinimalCountSpan.
/// </summary>
public int Count { get { return tokens.Length; } }
public IEnumerator<TokenSpanToken> GetEnumerator()
{
return tokens.Cast<TokenSpanToken>().GetEnumerator();
}
System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
{
return GetEnumerator();
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// A string which carries information about multiple tokenization hypotheses. Tokens can overlap, and/or
/// there can be gaps in the coverage of the surface string.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class TokenizedString : TokenSet
{
readonly String m_string;
static IEnumerable<ICharacterSpan> _prepare_char_spans(String s_input, IEnumerable<ICharacterSpan> iecs)
{
return iecs.Distinct()
.Select(tt =>
{
if (tt.Text == null)
{
String s_subst = s_input.Substring(tt.CharacterSpan.StartIndex, tt.CharacterSpan.Length);
tt = new CharacterSpanToken(tt.CharacterSpan, s_subst);
}
return tt as ICharacterSpan;
})
.OrderBy(sp => sp.CharacterSpan.StartIndex)
.ThenBy(sp => sp.CharacterSpan.EndIndex);
}
public TokenizedString(String s_input, IEnumerable<ICharacterSpan> iecs)
: base(_prepare_char_spans(s_input, iecs))
{
/// The source string is never modified from its original form but individual tokens can manifest
/// substitution text
m_string = s_input;
}
/// <summary>
/// Initialize a TokenizedString object by tokenizing the given string with the specified tokenizer
/// </summary>
public TokenizedString(String s_input, ITokenizer itok)
: this(s_input, itok.Tokenize(s_input))
{
}
/// <summary>
/// Initialize a TokenizedString object from an array of strings. The tokens are accepted as-is, but
/// the surface representation incorporates the specified string as a seprarator between each string.
/// </summary>
public TokenizedString(IEnumerable<String> word_array, String separator = null)
: base(CharacterSpanToken.GetStringArraySpans(word_array, (separator ?? String.Empty).Length))
{
m_string = String.Join(separator, word_array);
}
/// <summary>
/// For when the ISpanToken.Text is available, the source string is built from them
/// </summary>
/// <param name="i_span_toks"></param>
public TokenizedString(IEnumerable<ICharacterSpan> i_span_toks, String separator = null)
: base(i_span_toks)
{
m_string = String.Join(separator, i_span_toks.Select(ist => ist.Text));
}
/// <summary>
/// The original, unmodified text of the source string.
/// </summary>
public String Text { get { return m_string; } }
public override TokenizedString Source { get { return this; } }
/// <summary>
/// Returns true if the tokens leave any uncovered gaps in the source string.
/// </summary>
public bool AnyCharacterGaps
{
get { return UsedCharacterIndexes.Count != Text.Length; }
}
/// <summary>
/// All objects of type 'TokenizedString' have fully covering token spans by construction, because the
/// total coverage of the tokens that are incorporated when the object is built is how the contentful
/// regions are permanently defined. Subsets of these tokens--which can manifest partial coverage in
/// relation to an originating 'TokenizedString'--are represented with the 'TokenizationHypothesis'
/// sibling object.
/// </summary>
public override bool IsFullyCovered
{
get { return true; }
}
/// <summary>
/// Permits access to the text of an arbitrary span in the source string. For one thing, this allows one
/// to examine the contents of the gap portions, if desired.
/// </summary>
public String SpanText(Span sp)
{
return m_string.Substring(sp.StartIndex, sp.Length);
}
/// <summary>
/// This function is not intended for main-line use, as it reconstructs the analysis which is now
/// performed in the constructor
/// </summary>
public Span MinimalSpanToCharacterSpan(Span sp_min)
{
return new Span(MinimalCharacterSpans.ElementAt(sp_min.StartIndex).StartIndex,
MinimalCharacterSpans.ElementAt(sp_min.EndIndex).EndIndex);
}
public String MinimalSpanText(Span sp_min)
{
Span sp_chr = MinimalSpanToCharacterSpan(sp_min);
return m_string.Substring(sp_chr.StartIndex, sp_chr.Length);
}
/// <summary>
/// Returns those token sequences which fully cover the minimal token spans, if any, as individual single-
/// coverage hypotheses. Naturally some of these distinct sequences may re-use the same tokens; what this
/// function tries to guarantee is that each returned solution has no overlap. If one wishes to allow the
/// parser (for example) to take control of token selection, then one can just process the TokenizedString
/// to the parser instead, since such tools are expected to rely only on the common base class 'TokenSet'
/// </summary>
public IEnumerable<TokenizationHypothesis> Hypotheses
{
get
{
if (tokens.Length == 0)
yield break;
int ix_last = MinimalSpanCount - 1;
foreach (var s in tokens[0].RightAdjacencySequences().Where(seq => seq.Last().TokenSpan.EndIndex == ix_last))
yield return new TokenizationHypothesis(this, s);
}
}
public override string ToString()
{
return this.Select(tok => String.Format("{{{0}}}{1}", tok.Text, tok.CharacterSpan.ToString())).StringJoin(" ");
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// <summary>
/// As discussed above, TokenizedString has the ability to winnow all single coverage tokenization solutions from
/// a set of arbitrarily covering tokens (if any). These solutions are kept in this separate wrapper in order to
/// record their provenance.
/// </summary>
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
public class TokenizationHypothesis : TokenSet
{
TokenizedString ts;
public TokenizationHypothesis(TokenizedString ts, IEnumerable<TokenSpanToken> tokens)
: base(tokens)
{
this.ts = ts;
}
public override TokenizedString Source
{
get { return ts; }
}
/// <summary>
/// Returns true if this hypothesis fully covers all minimal span positions of the tokenization
/// superset (i.e. 'TokenizedString') from which this TokenizationHypothesis was drawn.
/// </summary>
public override bool IsFullyCovered
{
get
{
//return this.UsedTokenIndexes.Count == MinimalSpanCount - 1;
return this.UsedCharacterIndexes.Count == ts.UsedCharacterIndexes.Count;
}
}
};
#if false
public class TestTokenizer2 : ITokenizer
{
public IEnumerable<TokenizerToken> Tokenize(string s_input)
{
yield return new TokenizerToken(7, 9);
yield return new TokenizerToken(2, 4);
yield return new TokenizerToken(2, 6);
yield return new TokenizerToken(5, 5);
yield return new TokenizerToken(6, 6);
yield return new TokenizerToken(7, 10);
yield return new TokenizerToken(7, 8);
yield return new TokenizerToken(8, 8);
yield return new TokenizerToken(9, 9);
yield return new TokenizerToken(1, 1);
yield return new TokenizerToken(0, 3);
yield return new TokenizerToken(9, 11);
yield return new TokenizerToken(4, 4);
}
}
static void rpt(TokenizedString ts)
{
Console.WriteLine(@"input: [{0}]
Tokens:
{1}
any gaps: {2}
any overlaps: {3}
Minimal Span Count: {4}
Minimal Character Spans:
{5}
Minimal Spans Including Vacant:
{6}
",
ts.Text,
ts.Select(t =>
{
String ttt = new String(' ', t.CharacterSpan.StartIndex) + t.Text + new String(' ', ts.Text.Length - t.CharacterSpan.EndIndex - 1);
return String.Format("\t{0}\t{1}\t[{2}]", t.CharacterSpan, t.TokenSpan, ttt);
}).StringJoin(Environment.NewLine),
ts.AnyCharacterGaps,
ts.AnyOverlaps,
ts.MinimalSpanCount,
ts.MinimalCharacterSpans.Select(t => String.Format("\t{0}\t\\{1}\\", t, ts.SpanText(t))).StringJoin(Environment.NewLine),
ts.CharacterBoundarySpans.Select(t => String.Format("\t{0}\t\\{1}\\", t, ts.SpanText(t))).StringJoin(Environment.NewLine));
Console.WriteLine();
}
#endif
}