using System;
using System.Globalization;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;

using miew.Tokenization;

namespace erg_funcs
{
	///	the following temporary expedient attempts to get capitalization more right
	///	than we used to do in generator outputs.  still, for acronyms like `IBM' or
	///	complex names including lower case elements, i see no alternative to using
	///	ORTH to spell out the actual (canonical) surface form.  that would seem to
	///	require that we re-view assumptions about capitalization across the lexicon
	///	et al.  but the LKB should probably do that one day!        (30-aug-05; oe)
	///	--- as of late, the ERG lexicon actually contains (some) ORTH values that
	///	reflect canonical capitalization; the modified code below will now try to
	///	either (a) respect the orthography from the lexicon, as long as it contains
	///	at least one upper-case letter and is string-equal() to the inflected form
	///	(which tends to be true for proper names at least :-) or (b) invoke the old
	///	heuristics to try and guess appropriate capitalization.  still not quite a
	///	perfect solution, but to do better i now think the morphology would have to
	///	stop upcasing things as soon as one of the inflectional rules applies.
	///	                                                            (18-dec-06; oe)
	///	
	///	(defun gen-extract-surface (edge &optional (initialp t) &key cliticp stream)
	///	  (if stream
	///	    (let ((daughters (edge-children edge)))
	///	      (if daughters
	///	        (loop
	///	            for daughter in daughters
	///	            for foo = initialp then nil
	///	            do 
	///	              (setf cliticp 
	///	                (gen-extract-surface
	///	                 daughter foo :cliticp cliticp :stream stream))
	///	              #+:logon finally
	///	              #+:logon
	///	              (setf (edge-lnk edge)
	///	                (mrs::combine-lnks
	///	                 (edge-lnk (first daughters))
	///	                 (edge-lnk (first (last daughters))))))
	///	        (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge))))
	///	               (orth (format nil "~{~a~^ ~}" (lex-entry-orth entry)))
	///
	///	               ;; need to fix-up irregular cases like `Englishmen' manually :-{
	///
	///	               (orth (if (ppcre::scan "man$" orth)
	///	                       (subseq orth 0 (- (length orth) 3))
	///	                       orth))
	///	               (tdfs (and entry (lex-entry-full-fs entry)))
	///	               (type (and tdfs (type-of-fs (tdfs-indef tdfs))))
	///	               (string (string-downcase (copy-seq (first (edge-leaves edge)))))
	///
	///	               ;; _fix_me_
	///	               ;; maybe we could be more courageous and just search for .orth.
	///	               ;; as a sub-sequence of .string., starting at position .prefix.
	///	               ;;                                               (22-dec-06; oe)
	///	               
	///	               (prefix (loop
	///	                           for c across string
	///	                           while (member c '(#\( #\" #\') :test #'char=)
	///	                           count 1))
	///	               (suffix (min (length string) (+ prefix (length orth))))
	///	               (suffix (when (string-equal
	///	                              orth string :start2 prefix :end2 suffix)
	///	                         suffix))
	///	               (rawp (and suffix
	///	                          (loop for c across orth thereis (upper-case-p c))))
	///	               (capitalizep
	///	                (ignore-errors
	///	                 (loop
	///	                     for match in '(basic_n_proper_lexent
	///	                                    n_-_c-month_le
	///	                                    n_-_c-dow_le
	///	                                    n_-_pr-i_le)
	///	                     thereis (or (eq type match)
	///	                                 (subtype-p type match)))))
	///	               (cliticp (or cliticp
	///	                            (and (> (length string) 0)
	///	                                 (char= (char string 0) #\')))))
	///	          (if rawp
	///	            (setf string
	///	              (concatenate 'string
	///	                (subseq string 0 prefix) orth (subseq string suffix)))
	///	            (when capitalizep
	///	              (loop
	///	                  with spacep = t
	///	                  for i from 0 to (- (length string) 1)
	///	                  for c = (schar string i)
	///	                  when (char= c #\Space) do (setf spacep t)
	///	                  else when (char= c #\_)
	///	                  do
	///	                    (setf spacep t)
	///	                    (setf (schar string i) #\Space)
	///	                  else do
	///	                    (when (and spacep (alphanumericp c))
	///	                      (setf (schar string i) (char-upcase c)))
	///	                       (setf spacep nil))))
	///	          (when (and (> (length string) 1)
	///	                     (char= (char string 0) #\_)
	///	                     (upper-case-p (char string 1)))
	///	            (setf string (subseq string 1)))
	///	          (when (and initialp (alphanumericp (schar string 0)))
	///	            (setf (schar string 0) (char-upcase (schar string 0))))
	///	          (unless (or initialp cliticp)
	///	            (format stream " "))
	///	          (let (#+:logon 
	///	                (start (file-position stream)))
	///	            (loop
	///	                with hyphenp
	///	                for c across string
	///	                unless (and hyphenp (char= c #\space))
	///	                do (write-char c stream)
	///	                when (char= c #\-) do (setf hyphenp t)
	///	                else do (setf hyphenp nil))
	///	            #+:logon
	///	            (setf (edge-lnk edge)
	///	              (list :characters start (file-position stream))))
	///
	///	          ;; finally, inform the caller as to whether we output something that
	///	          ;; inhibits intervening space (e.g. `mid-July').
	///
	///	          (unless (string= orth "")
	///	            (member (schar orth (- (length orth) 1)) '(#\-) :test #'char=)))))
	///	    (let ((stream (make-string-output-stream)))
	///	      (gen-extract-surface edge initialp :stream stream)
	///	      (get-output-stream-string stream))))

	public class ErgTokenizer : ITokenizer
	{
		Func<String, bool> check_lexicon;

		public ErgTokenizer(Func<String, bool> check_lexicon)
		{
			this.check_lexicon = check_lexicon;
		}

		readonly static Char[] one_space = new Char[] { ' ' };
		readonly static Char[] punct = new Char[] { '.', ',', '!', '(', ')', '?' };
		readonly static String[] some_affixes = { "s", "ing", "ed" };
		readonly static String[] some_clitics = { "'s", "'ll", "'m", "'t", "'re", "'d", "'ve", "'bout", "'clock", "'kay", "'em", "'all" };

		/// <summary>
		/// First of all, just break on spaces
		/// </summary>
		IEnumerable<Span> inner_tokenize(String s_input)
		{
			int ix, ix_prev = 0;
			while ((ix = s_input.IndexOfAny(one_space, ix_prev)) != -1)
			{
				if (ix > ix_prev)
					yield return new Span(ix_prev, ix - 1);
				ix_prev = ix + 1;
			}
			if (s_input.Length > ix_prev)
				yield return new Span(ix_prev, s_input.Length - 1);
		}

		///// <summary>
		///// Check the tokens to see if some altered version should be submitted instead
		///// </summary>
		public IEnumerable<ICharacterSpan> Tokenize(string s_input)
		{
			foreach (Span sp in inner_tokenize(s_input))
			{
				String tok = s_input.Substring(sp.StartIndex, sp.Length);
				if (check_lexicon(tok))
				{
					yield return new CharacterSpanToken(sp, tok);
					continue;
				}

				/// trim start and end punctuation and remember how much we took from each
				String stem = null;
				String lookup_tok = tok.Trim(punct);
				if (some_clitics.Any(clitic => lookup_tok.EndsWith(clitic) &&
					check_lexicon(stem = lookup_tok.Remove(lookup_tok.Length - clitic.Length))))
				{
					int ei = sp.StartIndex + stem.Length - 1;
					yield return new CharacterSpanToken(new Span(sp.StartIndex, ei), stem);
					yield return new CharacterSpanToken(ei + 1, sp.EndIndex);
					continue;
				}

				/// "snow-clearing"  ->  |snow-|  |clearing|
				String[] parts = lookup_tok.Split('-');
				if (parts.Length == 2)
				{
					if (check_lexicon(parts[0]) && check_lexicon(parts[1]))
					{
						int ix = sp.StartIndex + parts[0].Length;
						yield return new CharacterSpanToken(sp.StartIndex, ix);
						yield return new CharacterSpanToken(ix + 1, sp.EndIndex);
						continue;
					}

					String no_hyphen = parts[0] + parts[1];
					if (check_lexicon(no_hyphen))
					{
						yield return new CharacterSpanToken(sp, no_hyphen);
						continue;
					}
				}

				/// try lower case
				tok = tok.ToLower();
				lookup_tok = lookup_tok.ToLower();
				if (check_lexicon(lookup_tok))
				{
					yield return new CharacterSpanToken(sp, tok);
					continue;
				}

				/// check clitic again now that it's lower case.
				if (some_clitics.Any(clitic => lookup_tok.EndsWith(clitic) &&
					check_lexicon(stem = lookup_tok.Remove(lookup_tok.Length - clitic.Length))))
				{
					int ei = sp.StartIndex + stem.Length - 1;
					yield return new CharacterSpanToken(new Span(sp.StartIndex, ei), stem);
					yield return new CharacterSpanToken(ei + 1, sp.EndIndex);
					continue;
				}

				/// getting desperate now, if truncating some common affixes results in the stem being found in lower 
				/// case, then use the lower case version
				if (some_affixes.Any(affix => lookup_tok.EndsWith(affix) &&
									check_lexicon(lookup_tok.Remove(lookup_tok.Length - affix.Length))))
				{
					yield return new CharacterSpanToken(sp, tok);
					continue;
				}

				/// Hmm. try title case.
				if (check_lexicon(CultureInfo.InvariantCulture.TextInfo.ToTitleCase(lookup_tok)))
				{
					yield return new CharacterSpanToken(sp, CultureInfo.CurrentCulture.TextInfo.ToTitleCase(tok));
					continue;
				}

				/// grr. just send the token with original capitalization
				yield return new CharacterSpanToken(sp);
			}
		}
	}
}
Grammar Composition System