Пример #1
0
        /// <summary>
        /// Splits the text of <paramref name="span"/> on the space character (' ') and registers
        /// every non-blank piece as a token on the span.
        /// </summary>
        /// <param name="span">
        /// Span to tokenize. Token boundaries passed to <c>AddToken</c> are inclusive and are
        /// shifted by <c>span.Begin</c>.
        /// </param>
        public void Parse(ISpan span)
        {
            var remaining  = span.ValueAsSpan;
            int baseOffset = span.Begin;
            int cursor     = 0; // position of 'remaining' inside the span's full text

            for (int spaceAt = remaining.IndexOf(' '); spaceAt >= 0; spaceAt = remaining.IndexOf(' '))
            {
                var piece = remaining.Slice(0, spaceAt);
                if (!piece.IsNullOrWhiteSpace())
                {
                    int tokenStart = baseOffset + cursor;
                    span.AddToken(tokenStart, tokenStart + spaceAt - 1);
                }

                // Step over the piece and the space that terminated it.
                remaining = remaining.Slice(spaceAt + 1);
                cursor   += spaceAt + 1;
            }

            // Whatever follows the last space (or the whole text when there was no space at all).
            if (cursor >= span.Length)
            {
                return;
            }

            var tail = span.ValueAsSpan.Slice(cursor, span.Length - cursor);
            if (tail.IsNullOrWhiteSpace())
            {
                return;
            }

            span.AddToken(baseOffset + cursor, baseOffset + span.Length - 1);
        }
Пример #2
0
        /// <summary>
        /// Reads tab-separated CoNLL-style treebank files and converts them into documents whose
        /// tokens carry a part-of-speech tag, a head index and a dependency type.
        /// </summary>
        /// <param name="trainDocuments">Paths of the files to read. When null, an empty list is returned.</param>
        /// <param name="arcNames">Receives every dependency type (lower-cased, sub-type after ':' removed) seen while reading.</param>
        /// <param name="language">Language of the corpus; for <c>Language.English</c> contractions are rewritten to full words.</param>
        /// <param name="isOntoNotes">
        /// When true, blank lines are treated as sentence boundaries, every file starts a new document,
        /// the tag in column 3 is mapped through <c>PartOfSpeechHelpers.EnglishPennToUniversal</c>, and
        /// head / relation are read from columns 5 / 6 instead of 6 / 7.
        /// </param>
        /// <returns>One document per "# newdoc" section (or a single document when no such marker exists).</returns>
        private static List <IDocument> ReadCorpus(List <string> trainDocuments, HashSet <string> arcNames, Language language, bool isOntoNotes = false)
        {
            if (trainDocuments is null)
            {
                return(new List <IDocument>());
            }

            // Token ids and head indexes are machine-written numbers: always parse them with the
            // invariant culture so that "8.1" is never read as 81 under cultures where '.' is a group separator.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            var idStyles  = System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands;

            var allLines = new List <string>();

            foreach (var f in trainDocuments)
            {
                if (isOntoNotes)
                {
                    allLines.Add("# newdoc");  //Force doc splits
                    allLines.Add("# sent_id"); //Force doc splits
                    allLines.AddRange(File.ReadAllLines(f).Select(l => string.IsNullOrWhiteSpace(l) ? "# sent_id" : l));
                }
                else
                {
                    allLines.AddRange(File.ReadAllLines(f).Where(l => !string.IsNullOrWhiteSpace(l)));
                }
            }

            // Group the lines per document; a "# newdoc" line opens a new group and is not stored itself.
            var docLines = new List <List <string> >();

            foreach (var line in allLines)
            {
                if (line.StartsWith("# newdoc", StringComparison.Ordinal))
                {
                    docLines.Add(new List <string>());
                }
                else
                {
                    if (docLines.Count == 0)
                    {
                        docLines.Add(new List <string>());
                    }
                    docLines[docLines.Count - 1].Add(line);
                }
            }

            var documents = new List <IDocument>();

            foreach (var docline in docLines)
            {
                var  doc        = new Document();
                bool invalidDoc = false; // no rule currently marks a document as invalid; kept as a hook for filters

                ISpan span = null;
                var   sb   = new StringBuilder();

                foreach (var l in docline)
                {
                    if (l.StartsWith("# sent_id", StringComparison.Ordinal))
                    {
                        // Every sentence id opens a new (initially empty) span at the current end of the text.
                        span = doc.AddSpan(sb.Length, sb.Length);
                        continue;
                    }

                    if (l.StartsWith("#", StringComparison.Ordinal))
                    {
                        continue; // any other comment line
                    }

                    var parts = l.Split('\t');
                    if (parts[0].Contains("-"))
                    {
                        continue; //Pseudo-token, such as cannot -> proceed by can + not
                    }

                    if (!double.TryParse(parts[0], idStyles, invariant, out double index))
                    {
                        continue;
                    }

                    if ((int)(index * 10) != ((int)index) * 10)
                    {
                        continue; // ids with a fractional part (e.g. 8.1) are not real tokens
                    }

                    string       txt   = parts[1];
                    string       lemma = parts[2];
                    string       pos   = parts[3];
                    PartOfSpeech POS;

                    if (isOntoNotes)
                    {
                        POS = PartOfSpeechHelpers.EnglishPennToUniversal[pos];
                    }
                    else
                    {
                        POS = (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), pos);
                    }

                    if (language == Language.English)
                    {
                        txt = NormalizeEnglishToken(txt, lemma, pos, POS, l);
                    }

                    if (span is null)
                    {
                        // Token row before any "# sent_id" line: open a span instead of failing on a null reference.
                        span = doc.AddSpan(sb.Length, sb.Length);
                    }

                    int begin = sb.Length;
                    int end   = begin + txt.Length - 1;
                    sb.Append(txt).Append(' '); // tokens are always joined by a single space in the rebuilt text
                    span.End = sb.Length - 1;

                    int    head    = int.Parse(parts[isOntoNotes ? 5 : 6], invariant) - 1; // stored head is zero-based
                    string arcType = parts[isOntoNotes ? 6 : 7].ToLowerInvariant().Split(':').First();

                    var token = span.AddToken(begin, end);
                    token.POS            = POS;
                    token.Head           = head;
                    token.DependencyType = arcType;

                    arcNames.Add(arcType); // HashSet.Add is a no-op for names already present
                }

                doc.Value = sb.ToString();
                doc.TrimTokens();
                if (!invalidDoc)
                {
                    documents.Add(doc);
                }
                else
                {
                    Logger.LogInformation("skipping document:\n" + doc.TokenizedValue + "\n");
                }
            }

            return(documents);
        }

        /// <summary>
        /// Rewrites English treebank tokens that are tokenized differently here than in the source data
        /// (mostly contractions such as "'s" -> "is" or "n't" -> "not"). Rules are tested in order and the
        /// first matching rule wins; tokens matching no rule are returned unchanged.
        /// </summary>
        /// <param name="txt">Surface form of the token.</param>
        /// <param name="lemma">Lemma column of the token.</param>
        /// <param name="pos">Raw tag string from the file.</param>
        /// <param name="POS">Resolved part-of-speech of the token.</param>
        /// <param name="line">Full source line; logged to "missing_contractions.txt" for unhandled apostrophe tokens.</param>
        /// <returns>The text to use for the token.</returns>
        private static string NormalizeEnglishToken(string txt, string lemma, string pos, PartOfSpeech POS, string line)
        {
            //Should add more exceptions here on how we handle tokenization differently than the original Conll data
            string lowerTxt = txt.ToLowerInvariant();
            bool   isBe     = lemma.ToLowerInvariant() == "be";
            bool   isVerb   = POS == PartOfSpeech.VERB;
            bool   isAux    = POS == PartOfSpeech.AUX;

            if ((lowerTxt == "'s" || lowerTxt == "s") && (isBe || isVerb || isAux))
            {
                return "is";
            }

            if ((lowerTxt == "'m" || lowerTxt == "m") && (isBe || isVerb || isAux))
            {
                return "am";
            }

            if ((lowerTxt == "'re" || lowerTxt == "re") && (isBe || isVerb))
            {
                return "are";
            }

            if ((lowerTxt == "ll" || lowerTxt == "'ll") && (isVerb || isAux))
            {
                return "will";
            }

            if (lowerTxt == "'d" && isAux)
            {
                return "would";
            }

            if (lowerTxt == "'d" && isVerb)
            {
                return "had";
            }

            if (lowerTxt == "n't")
            {
                return "not";
            }

            if (lowerTxt == "'ve")
            {
                return "have";
            }

            if (txt.Length > 1 && txt.StartsWith("/", StringComparison.Ordinal) && pos == ".")
            {
                return txt.Substring(1); // drop the leading '/' of tokens tagged "."
            }

            if (txt == "'" && lemma == "'s" && (POS == PartOfSpeech.PART || POS == PartOfSpeech.PRON))
            {
                return txt; // ok
            }

            if (txt.StartsWith("'", StringComparison.Ordinal) &&
                !(txt == "'s" && POS == PartOfSpeech.PART) &&
                !(txt == "'" && POS == PartOfSpeech.PART) &&
                !(txt == "'s" && POS == PartOfSpeech.PRON) &&
                !(txt == "'" && POS == PartOfSpeech.PUNCT))
            {
                // Unknown apostrophe form: record the line (without its id column) so a rule can be added later.
                File.AppendAllLines("missing_contractions.txt", new string[] { line.Split(new char[] { '\t' }, 2).Last() });
                return txt;
            }

            // NOTE(review): the scheme is prepended only when the text already contains "://";
            // the condition may have been meant to be negated - confirm against the corpus before changing.
            if (lemma == "#hlink#" && txt.Contains("://"))
            {
                return "http://" + txt;
            }

            return txt;
        }
Пример #3
0
        /// <summary>
        /// Tokenizes the text of <paramref name="span"/>. The text is first cut at whitespace (and at
        /// emoji boundaries when the text contains an emoji); each resulting candidate is then reduced
        /// by special cases, URL/e-mail detection, emoji, punctuation, prefix, suffix and infix rules
        /// into split points, which are finally turned into tokens on the span.
        /// </summary>
        /// <param name="span">Span to tokenize; tokens are added with offsets shifted by <c>span.Begin</c>.</param>
        /// <exception cref="InvalidOperationException">
        /// Thrown when more split points than characters were produced, or when a split point has begin &gt; end.
        /// </exception>
        public void Parse(ISpan span)
        {
            //TODO: store if a splitpoint is special case, do not try to fetch hash if not!
            var separators = CharacterClasses.WhitespaceCharacters;
            var textSpan   = span.ValueAsSpan;

            // Emoji handling is only enabled when at least one emoji is found, so plain text skips the extra scans.
            bool hasEmoji = false;

            for (int i = 0; i < textSpan.Length - 1; i++)
            {
                if (textSpan.Slice(i).IsEmoji(out _))
                {
                    hasEmoji = true; break;
                }
            }

            var splitPoints = new List <SplitPoint>(textSpan.Length / 4);

            int offset = 0, sufix_offset = 0;

            while (true)
            {
                if (splitPoints.Count > textSpan.Length)
                {
                    throw new InvalidOperationException(); //If we found more splitting points than actual characters on the span, we hit a bug in the tokenizer
                }

                // Suffixes split off in the previous round still have to be stepped over.
                offset      += sufix_offset;
                sufix_offset = 0;
                if (offset > textSpan.Length)
                {
                    break;
                }
                var splitPoint = textSpan.IndexOfAny(separators, offset);
                ReadOnlySpan <char> candidate;

                // Characters to skip after the candidate: 1 for the whitespace separator (or the end of the text),
                // 0 when the candidate was cut at an emoji boundary, where nothing separates it from what follows.
                int separatorLength = 1;

                if (splitPoint == offset)
                {
                    //Happens on sequential separators
                    offset++; continue;
                }

                if (splitPoint < 0)
                {
                    candidate  = textSpan.Slice(offset);
                    splitPoint = offset + candidate.Length;
                    if (candidate.Length == 0)
                    {
                        break;
                    }
                }
                else
                {
                    candidate = textSpan.Slice(offset, splitPoint - offset);
                }

                //Special case to split also at emojis
                if (hasEmoji)
                {
                    for (int i = 0; i < (candidate.Length - 1); i++)
                    {
                        if (candidate.Slice(i).IsEmoji(out var emojiLength))
                        {
                            // Keep either the emoji itself (when it leads the candidate) or the text in front of it.
                            // splitPoint stays the exclusive end of the candidate, exactly as for whitespace splits,
                            // so the last character before the emoji is not lost.
                            int cut = (i == 0) ? emojiLength : i;
                            splitPoint      = offset + cut;
                            candidate       = candidate.Slice(0, cut);
                            separatorLength = 0;
                            break;
                        }
                    }
                }

                while (!candidate.IsEmpty)
                {
                    int hash = candidate.CaseSensitiveHash32();
                    if (SpecialCases.ContainsKey(hash))
                    {
                        splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Exception));
                        candidate = new ReadOnlySpan <char>();
                        offset    = splitPoint + separatorLength;
                        continue;
                    }
                    else if (candidate.IsLikeURLorEmail())
                    {
                        splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.EmailOrUrl));
                        candidate = new ReadOnlySpan <char>();
                        offset    = splitPoint + separatorLength;
                        continue;
                    }
                    else if (hasEmoji && candidate.IsEmoji(out var emojiLength))
                    {
                        splitPoints.Add(new SplitPoint(offset, offset + emojiLength - 1, SplitPointReason.Emoji));
                        candidate = candidate.Slice(emojiLength);
                        offset   += emojiLength;
                        continue;
                    }
                    else
                    {
                        if (candidate.Length == 1)
                        {
                            splitPoints.Add(new SplitPoint(offset, offset, SplitPointReason.SingleChar));
                            candidate = new ReadOnlySpan <char>();
                            offset    = splitPoint + separatorLength;
                            continue;
                        }

                        if (!candidate.IsAllLetterOrDigit())
                        {
                            if (candidate.IsSentencePunctuation() || candidate.IsHyphen() || candidate.IsSymbol())
                            {
                                splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Punctuation));
                                candidate = new ReadOnlySpan <char>();
                                offset    = splitPoint + separatorLength;
                                continue;
                            }

                            int prefixLocation = FindPrefix(candidate);
                            if (prefixLocation >= 0)
                            {
                                // Emit the prefix character and keep working on what follows it.
                                splitPoints.Add(new SplitPoint(offset + prefixLocation, offset + prefixLocation, SplitPointReason.Prefix));
                                candidate = candidate.Slice(prefixLocation + 1);
                                offset   += prefixLocation + 1;
                                continue;
                            }

                            var(sufixIndex, sufixLength) = FindSufix(candidate);

                            if (sufixIndex > -1)
                            {
                                // Emit the suffix, shorten the candidate, and remember to skip the suffix
                                // once the remaining part of the candidate has been consumed.
                                splitPoints.Add(new SplitPoint(offset + sufixIndex, offset + sufixIndex + sufixLength - 1, SplitPointReason.Sufix));
                                candidate     = candidate.Slice(0, sufixIndex);
                                splitPoint    = offset + sufixIndex;
                                sufix_offset += sufixLength;
                                continue;
                            }

                            var infixLocation = FindInfix(candidate);
                            if (infixLocation.Count > 0)
                            {
                                int in_offset = offset;

                                foreach (var(index, length) in infixLocation)
                                {
                                    if ((offset + index - 1) >= in_offset)
                                    {
                                        splitPoints.Add(new SplitPoint(in_offset, offset + index - 1, SplitPointReason.Infix));
                                    }

                                    //Test if the remaining is not an exception first
                                    if ((in_offset - offset + index) <= candidate.Length)
                                    {
                                        var rest     = candidate.Slice(in_offset - offset + index);
                                        int hashRest = rest.CaseSensitiveHash32();

                                        if (SpecialCases.ContainsKey(hashRest))
                                        {
                                            in_offset = offset + index;
                                            break;
                                        }
                                    }
                                    in_offset = offset + index + length;
                                    splitPoints.Add(new SplitPoint(offset + index, offset + index + length - 1, SplitPointReason.Infix));
                                }

                                candidate = candidate.Slice(in_offset - offset);

                                offset = in_offset;
                                continue;
                            }
                        }
                    }

                    splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Normal));
                    candidate = new ReadOnlySpan <char>();
                    offset    = splitPoint + separatorLength;
                }
            }

            int spanBegin = span.Begin;
            int pB = int.MinValue, pE = int.MinValue;

            span.ReserveTokens(splitPoints.Count);
            foreach (var sp in splitPoints.OrderBy(s => s.Begin).ThenBy(s => s.End))
            {
                int b = sp.Begin;
                int e = sp.End;

                // Split points are sorted, so identical ranges are adjacent and can be dropped here.
                if (pB == b && pE == e)
                {
                    continue;
                }
                pB = b; pE = e;

                if (b > e)
                {
                    Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
                    throw new InvalidOperationException();
                }

                // Trim whitespace on both sides without ever making the range empty.
                while (char.IsWhiteSpace(textSpan[b]) && b < e)
                {
                    b++;
                }

                while (char.IsWhiteSpace(textSpan[e]) && e > b)
                {
                    e--;
                }

                if (e < b)
                {
                    Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
                    continue;
                }

                // Only hash the range once it is known to be valid.
                int hash = textSpan.Slice(b, e - b + 1).CaseSensitiveHash32();

                if (SpecialCases.TryGetValue(hash, out TokenizationException exp))
                {
                    if (exp.Replacements is null)
                    {
                        span.AddToken(spanBegin + b, spanBegin + e);
                    }
                    else
                    {
                        //TODO: Tokens begins and ends are being artificially placed here, check in the future how to better handle this
                        int begin2 = spanBegin + b;
                        for (int i = 0; i < exp.Replacements.Length; i++)
                        {
                            //Adds replacement tokens sequentially, consuming one char from the original document at a time, and
                            //using the remaining chars in the last replacement token
                            var tk = span.AddToken(begin2, ((i == exp.Replacements.Length - 1) ? (spanBegin + e) : begin2));
                            tk.Replacement = exp.Replacements[i];
                            begin2++;
                        }
                    }
                }
                else
                {
                    var tk = span.AddToken(spanBegin + b, spanBegin + e);
                    if (sp.Reason == SplitPointReason.EmailOrUrl && !DisableEmailOrURLCapture)
                    {
                        tk.AddEntityType(new EntityType("EmailOrURL", EntityTag.Single));
                    }
                }
            }
        }