using System; using System.Globalization; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading; using miew.Tokenization; namespace erg_funcs { /// the following temporary expedient attempts to get capitalization more right /// than we used to do in generator outputs. still, for acronyms like `IBM' or /// complex names including lower case elements, i see no alternative to using /// ORTH to spell out the actual (canonical) surface form. that would seem to /// require that we re-view assumptions about capitalization across the lexicon /// et al. but the LKB should probably do that one day! (30-aug-05; oe) /// --- as of late, the ERG lexicon actually contains (some) ORTH values that /// reflect canonical capitalization; the modified code below will now try to /// either (a) respect the orthography from the lexicon, as long as it contains /// at least one upper-case letter and is string-equal() to the inflected form /// (which tends to be true for proper names at least :-) or (b) invoke the old /// heuristics to try and guess appropriate capitalization. still not quite a /// perfect solution, but to do better i now think the morphology would have to /// stop upcasing things as soon as one of the inflectional rules applies. /// (18-dec-06; oe) /// /// (defun gen-extract-surface (edge &optional (initialp t) &key cliticp stream) /// (if stream /// (let ((daughters (edge-children edge))) /// (if daughters /// (loop /// for daughter in daughters /// for foo = initialp then nil /// do /// (setf cliticp /// (gen-extract-surface /// daughter foo :cliticp cliticp :stream stream)) /// #+:logon finally /// #+:logon /// (setf (edge-lnk edge) /// (mrs::combine-lnks /// (edge-lnk (first daughters)) /// (edge-lnk (first (last daughters)))))) /// (let* ((entry (get-lex-entry-from-id (first (edge-lex-ids edge)))) /// (orth (format nil "~{~a~^ ~}" (lex-entry-orth entry))) /// /// ;; need to fix-up irregular cases like `Englishmen' manually :-{ /// /// (orth (if (ppcre::scan "man$" orth) /// (subseq orth 0 (- (length orth) 3)) /// orth)) /// (tdfs (and entry (lex-entry-full-fs entry))) /// (type (and tdfs (type-of-fs (tdfs-indef tdfs)))) /// (string (string-downcase (copy-seq (first (edge-leaves edge))))) /// /// ;; _fix_me_ /// ;; maybe we could be more courageous and just search for .orth. /// ;; as a sub-sequence of .string., starting at position .prefix. /// ;; (22-dec-06; oe) /// /// (prefix (loop /// for c across string /// while (member c '(#\( #\" #\') :test #'char=) /// count 1)) /// (suffix (min (length string) (+ prefix (length orth)))) /// (suffix (when (string-equal /// orth string :start2 prefix :end2 suffix) /// suffix)) /// (rawp (and suffix /// (loop for c across orth thereis (upper-case-p c)))) /// (capitalizep /// (ignore-errors /// (loop /// for match in '(basic_n_proper_lexent /// n_-_c-month_le /// n_-_c-dow_le /// n_-_pr-i_le) /// thereis (or (eq type match) /// (subtype-p type match))))) /// (cliticp (or cliticp /// (and (> (length string) 0) /// (char= (char string 0) #\'))))) /// (if rawp /// (setf string /// (concatenate 'string /// (subseq string 0 prefix) orth (subseq string suffix))) /// (when capitalizep /// (loop /// with spacep = t /// for i from 0 to (- (length string) 1) /// for c = (schar string i) /// when (char= c #\Space) do (setf spacep t) /// else when (char= c #\_) /// do /// (setf spacep t) /// (setf (schar string i) #\Space) /// else do /// (when (and spacep (alphanumericp c)) /// (setf (schar string i) (char-upcase c))) /// (setf spacep nil)))) /// (when (and (> (length string) 1) /// (char= (char string 0) #\_) /// (upper-case-p (char string 1))) /// (setf string (subseq string 1))) /// (when (and initialp (alphanumericp (schar string 0))) /// (setf (schar string 0) (char-upcase (schar string 0)))) /// (unless (or initialp cliticp) /// (format stream " ")) /// (let (#+:logon /// (start (file-position stream))) /// (loop /// with hyphenp /// for c across string /// unless (and hyphenp (char= c #\space)) /// do (write-char c stream) /// when (char= c #\-) do (setf hyphenp t) /// else do (setf hyphenp nil)) /// #+:logon /// (setf (edge-lnk edge) /// (list :characters start (file-position stream)))) /// /// ;; finally, inform the caller as to whether we output something that /// ;; inhibits intervening space (e.g. `mid-July'). /// /// (unless (string= orth "") /// (member (schar orth (- (length orth) 1)) '(#\-) :test #'char=))))) /// (let ((stream (make-string-output-stream))) /// (gen-extract-surface edge initialp :stream stream) /// (get-output-stream-string stream)))) public class ErgTokenizer : ITokenizer { Func<String, bool> check_lexicon; public ErgTokenizer(Func<String, bool> check_lexicon) { this.check_lexicon = check_lexicon; } readonly static Char[] one_space = new Char[] { ' ' }; readonly static Char[] punct = new Char[] { '.', ',', '!', '(', ')', '?' }; readonly static String[] some_affixes = { "s", "ing", "ed" }; readonly static String[] some_clitics = { "'s", "'ll", "'m", "'t", "'re", "'d", "'ve", "'bout", "'clock", "'kay", "'em", "'all" }; /// <summary> /// First of all, just break on spaces /// </summary> IEnumerable<Span> inner_tokenize(String s_input) { int ix, ix_prev = 0; while ((ix = s_input.IndexOfAny(one_space, ix_prev)) != -1) { if (ix > ix_prev) yield return new Span(ix_prev, ix - 1); ix_prev = ix + 1; } if (s_input.Length > ix_prev) yield return new Span(ix_prev, s_input.Length - 1); } ///// <summary> ///// Check the tokens to see if some altered version should be submitted instead ///// </summary> public IEnumerable<ICharacterSpan> Tokenize(string s_input) { foreach (Span sp in inner_tokenize(s_input)) { String tok = s_input.Substring(sp.StartIndex, sp.Length); if (check_lexicon(tok)) { yield return new CharacterSpanToken(sp, tok); continue; } /// trim start and end punctuation and remember how much we took from each String stem = null; String lookup_tok = tok.Trim(punct); if (some_clitics.Any(clitic => lookup_tok.EndsWith(clitic) && check_lexicon(stem = lookup_tok.Remove(lookup_tok.Length - clitic.Length)))) { int ei = sp.StartIndex + stem.Length - 1; yield return new CharacterSpanToken(new Span(sp.StartIndex, ei), stem); yield return new CharacterSpanToken(ei + 1, sp.EndIndex); continue; } /// "snow-clearing" -> |snow-| |clearing| String[] parts = lookup_tok.Split('-'); if (parts.Length == 2) { if (check_lexicon(parts[0]) && check_lexicon(parts[1])) { int ix = sp.StartIndex + parts[0].Length; yield return new CharacterSpanToken(sp.StartIndex, ix); yield return new CharacterSpanToken(ix + 1, sp.EndIndex); continue; } String no_hyphen = parts[0] + parts[1]; if (check_lexicon(no_hyphen)) { yield return new CharacterSpanToken(sp, no_hyphen); continue; } } /// try lower case tok = tok.ToLower(); lookup_tok = lookup_tok.ToLower(); if (check_lexicon(lookup_tok)) { yield return new CharacterSpanToken(sp, tok); continue; } /// check clitic again now that it's lower case. if (some_clitics.Any(clitic => lookup_tok.EndsWith(clitic) && check_lexicon(stem = lookup_tok.Remove(lookup_tok.Length - clitic.Length)))) { int ei = sp.StartIndex + stem.Length - 1; yield return new CharacterSpanToken(new Span(sp.StartIndex, ei), stem); yield return new CharacterSpanToken(ei + 1, sp.EndIndex); continue; } /// getting desperate now, if truncating some common affixes results in the stem being found in lower /// case, then use the lower case version if (some_affixes.Any(affix => lookup_tok.EndsWith(affix) && check_lexicon(lookup_tok.Remove(lookup_tok.Length - affix.Length)))) { yield return new CharacterSpanToken(sp, tok); continue; } /// Hmm. try title case. if (check_lexicon(CultureInfo.InvariantCulture.TextInfo.ToTitleCase(lookup_tok))) { yield return new CharacterSpanToken(sp, CultureInfo.CurrentCulture.TextInfo.ToTitleCase(tok)); continue; } /// grr. just send the token with original capitalization yield return new CharacterSpanToken(sp); } } } }