/// <summary>
/// Splits the text of <paramref name="span"/> at every space character (' ') and adds one
/// token to the span for each piece that is not empty or whitespace-only.
/// </summary>
/// <param name="span">The span to tokenize; tokens are registered on it through <c>AddToken</c>.</param>
public void Parse(ISpan span)
{
    var remaining = span.ValueAsSpan;
    int baseOffset = span.Begin; // token positions are reported as span.Begin + position inside the span text
    int consumed = 0;            // number of characters of the span text already scanned

    for (int spaceAt = remaining.IndexOf(' '); spaceAt >= 0; spaceAt = remaining.IndexOf(' '))
    {
        var piece = remaining.Slice(0, spaceAt);
        if (!piece.IsNullOrWhiteSpace())
        {
            int first = baseOffset + consumed;
            // The end index passed is the position of the last character of the piece.
            span.AddToken(first, first + spaceAt - 1);
        }

        // Continue right after the space that ended this piece.
        remaining = remaining.Slice(spaceAt + 1);
        consumed += spaceAt + 1;
    }

    // Nothing is left after the last space.
    if (consumed >= span.Length)
    {
        return;
    }

    // Whatever follows the last space (or the whole text when it has no space) is the final piece.
    var tail = span.ValueAsSpan.Slice(consumed, span.Length - consumed);
    if (!tail.IsNullOrWhiteSpace())
    {
        span.AddToken(baseOffset + consumed, baseOffset + span.Length - 1);
    }
}
/// <summary>
/// Reads tab-separated, dependency-annotated corpus files and converts them into documents whose
/// tokens carry a part-of-speech tag, a head index and a dependency type.
/// </summary>
/// <param name="trainDocuments">Paths of the files to read. <c>null</c> yields an empty list.</param>
/// <param name="arcNames">Receives every dependency type seen (lower-cased, truncated at the first ':').</param>
/// <param name="language">Corpus language; <c>Language.English</c> enables contraction normalization.</param>
/// <param name="isOntoNotes">
/// When true, every file starts a new document, blank lines act as sentence separators, the tag in
/// column 3 is mapped through <c>PartOfSpeechHelpers.EnglishPennToUniversal</c>, and head / relation
/// are read from columns 5 / 6. Otherwise blank lines are dropped, the tag is parsed as a
/// <c>PartOfSpeech</c> name, and head / relation are read from columns 6 / 7 (columns are 0-based).
/// </param>
/// <returns>One document per "# newdoc" section (a single document if no such line exists).</returns>
/// <exception cref="InvalidDataException">A token line appears before any "# sent_id" line of its document.</exception>
private static List<IDocument> ReadCorpus(List<string> trainDocuments, HashSet<string> arcNames, Language language, bool isOntoNotes = false)
{
    if (trainDocuments is null)
    {
        return new List<IDocument>();
    }

    // 1) Gather the relevant lines of all files.
    var allLines = new List<string>();
    foreach (var f in trainDocuments)
    {
        if (isOntoNotes)
        {
            // Force a document split per file and a sentence split at the top of the file and at every blank line.
            allLines.Add("# newdoc");
            allLines.Add("# sent_id");
            allLines.AddRange(File.ReadAllLines(f).Select(l => string.IsNullOrWhiteSpace(l) ? "# sent_id" : l));
        }
        else
        {
            allLines.AddRange(File.ReadAllLines(f).Where(l => !string.IsNullOrWhiteSpace(l)));
        }
    }

    // 2) Group the lines per document; a "# newdoc" line starts a new group and is itself dropped.
    var docLines = new List<List<string>>();
    foreach (var line in allLines)
    {
        if (line.StartsWith("# newdoc", StringComparison.Ordinal))
        {
            docLines.Add(new List<string>());
        }
        else
        {
            if (docLines.Count == 0)
            {
                docLines.Add(new List<string>());
            }
            docLines[docLines.Count - 1].Add(line);
        }
    }

    // 3) Build one document per group.
    var documents = new List<IDocument>();
    foreach (var docline in docLines)
    {
        var doc = new Document();
        // Hook for rejecting a document. Nothing sets it at the moment, so every document is kept.
        bool invalidDoc = false;
        ISpan span = null;
        var sb = new StringBuilder();

        foreach (var l in docline)
        {
            if (l.StartsWith("# sent_id", StringComparison.Ordinal))
            {
                // A new sentence: open an (initially empty) span at the current end of the text.
                span = doc.AddSpan(sb.Length, sb.Length);
                continue;
            }

            if (l.StartsWith("#", StringComparison.Ordinal))
            {
                continue; // any other comment line
            }

            var parts = l.Split('\t');

            if (parts[0].Contains("-"))
            {
                continue; // Pseudo-token, such as cannot -> proceed by can + not
            }

            // Parse the id with the invariant culture: under cultures where '.' is a group separator,
            // an id such as "1.1" would otherwise be read as 11 and slip through the check below.
            if (!double.TryParse(parts[0], System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out double index))
            {
                continue;
            }

            // Only ids without a fractional part are real tokens.
            if ((int)(index * 10) != ((int)index) * 10)
            {
                continue;
            }

            if (span is null)
            {
                throw new InvalidDataException("Token line found before any '# sent_id' line: " + l);
            }

            string txt = parts[1];
            string lemma = parts[2];
            string pos = parts[3];

            PartOfSpeech POS;
            if (isOntoNotes)
            {
                POS = PartOfSpeechHelpers.EnglishPennToUniversal[pos];
            }
            else
            {
                POS = (PartOfSpeech)Enum.Parse(typeof(PartOfSpeech), pos);
            }

            if (language == Language.English)
            {
                txt = NormalizeEnglishToken(txt, lemma, pos, POS, l);
            }

            // Tokens are always joined by a single space; a "SpaceAfter=No" hint in the data is not used.
            int begin = sb.Length;
            int end = begin + txt.Length - 1;
            sb.Append(txt).Append(' ');
            span.End = sb.Length - 1;

            var token = span.AddToken(begin, end);
            token.POS = POS;

            // The head is stored as the file's value minus one.
            int head = int.Parse(parts[isOntoNotes ? 5 : 6], System.Globalization.CultureInfo.InvariantCulture) - 1;
            // Keep only the part of the relation before the first ':'.
            string arcType = parts[isOntoNotes ? 6 : 7].ToLowerInvariant().Split(':').First();

            token.Head = head;
            token.DependencyType = arcType;
            arcNames.Add(arcType); // HashSet.Add ignores values that are already present
        }

        doc.Value = sb.ToString();
        doc.TrimTokens();

        if (!invalidDoc)
        {
            documents.Add(doc);
        }
        else
        {
            Logger.LogInformation("skipping document:\n" + doc.TokenizedValue + "\n");
        }
    }

    return documents;

    // Rewrites English token text where we tokenize differently than the original CoNLL data
    // (mostly contractions). Returns the text to use; may append the raw line (minus its first
    // column) to "missing_contractions.txt" for apostrophe-initial tokens that no rule covers.
    // Should add more exceptions here as they are found.
    string NormalizeEnglishToken(string text, string lemmaText, string rawTag, PartOfSpeech tag, string sourceLine)
    {
        string lower = text.ToLowerInvariant();
        bool lemmaIsBe = lemmaText.ToLowerInvariant() == "be";
        bool isVerb = tag == PartOfSpeech.VERB;
        bool isAux = tag == PartOfSpeech.AUX;

        if ((lower == "'s" || lower == "s") && (lemmaIsBe || isVerb || isAux))
        {
            return "is";
        }
        if ((lower == "'m" || lower == "m") && (lemmaIsBe || isVerb || isAux))
        {
            return "am";
        }
        if ((lower == "'re" || lower == "re") && (lemmaIsBe || isVerb))
        {
            return "are";
        }
        if ((lower == "ll" || lower == "'ll") && (isVerb || isAux))
        {
            return "will";
        }
        if (lower == "'d" && isAux)
        {
            return "would";
        }
        if (lower == "'d" && isVerb)
        {
            return "had";
        }
        if (lower == "n't")
        {
            return "not";
        }
        if (lower == "'ve")
        {
            return "have";
        }
        if (text.Length > 1 && text.StartsWith("/", StringComparison.Ordinal) && rawTag == ".")
        {
            return text.Substring(1);
        }
        if (text == "'" && lemmaText == "'s" && (tag == PartOfSpeech.PART || tag == PartOfSpeech.PRON))
        {
            return text; // ok as it is
        }
        if (text.StartsWith("'", StringComparison.Ordinal)
            && !(text == "'s" && tag == PartOfSpeech.PART)
            && !(text == "'" && tag == PartOfSpeech.PART)
            && !(text == "'s" && tag == PartOfSpeech.PRON)
            && !(text == "'" && tag == PartOfSpeech.PUNCT))
        {
            // Unknown contraction: record it for later inspection and keep the text unchanged.
            File.AppendAllLines("missing_contractions.txt", new string[] { sourceLine.Split(new char[] { '\t' }, 2).Last() });
            return text;
        }
        if (lemmaText == "#hlink#" && text.Contains("://"))
        {
            // NOTE(review): this prefixes text that already contains "://" - confirm this is intended.
            return "http://" + text;
        }

        return text;
    }
}
/// <summary>
/// Tokenizes the text of <paramref name="span"/>. The text is first cut at the characters in
/// <c>CharacterClasses.WhitespaceCharacters</c>; every chunk is then split further using the
/// special-case table, URL/e-mail detection, emoji detection, punctuation checks and the
/// prefix / suffix / infix finders. The resulting split points are sorted, de-duplicated and
/// added to the span as tokens.
/// </summary>
/// <param name="span">The span to tokenize; tokens are registered on it through <c>AddToken</c>.</param>
/// <exception cref="InvalidOperationException">
/// More split points than characters were produced, or a split point has begin &gt; end.
/// </exception>
public void Parse(ISpan span)
{
    //TODO: store if a splitpoint is special case, do not try to fetch hash if not!
    var separators = CharacterClasses.WhitespaceCharacters;
    var textSpan = span.ValueAsSpan;

    // Emoji handling is only enabled when the text contains at least one emoji.
    bool hasEmoji = false;
    for (int i = 0; i < textSpan.Length - 1; i++)
    {
        if (textSpan.Slice(i).IsEmoji(out _))
        {
            hasEmoji = true;
            break;
        }
    }

    var splitPoints = new List<SplitPoint>(textSpan.Length / 4);

    // offset: position in textSpan where the current chunk starts.
    // sufix_offset: length of suffixes already split off the current chunk; added to offset on the next round.
    int offset = 0, sufix_offset = 0;

    while (true)
    {
        if (splitPoints.Count > textSpan.Length)
        {
            //If we found more splitting points than actual characters on the span, we hit a bug in the tokenizer
            throw new InvalidOperationException();
        }

        offset += sufix_offset;
        sufix_offset = 0;

        if (offset > textSpan.Length)
        {
            break;
        }

        var splitPoint = textSpan.IndexOfAny(separators, offset);
        ReadOnlySpan<char> candidate;

        if (splitPoint == offset)
        {
            //Happens on sequential separators
            offset++;
            continue;
        }

        if (splitPoint < 0)
        {
            // No further separator: the chunk runs to the end of the text.
            candidate = textSpan.Slice(offset);
            splitPoint = offset + candidate.Length;
            if (candidate.Length == 0)
            {
                break;
            }
        }
        else
        {
            candidate = textSpan.Slice(offset, splitPoint - offset);
        }

        //Special case to split also at emojis
        if (hasEmoji)
        {
            for (int i = 0; i < (candidate.Length - 1); i++)
            {
                if (candidate.Slice(i).IsEmoji(out var emojiLength))
                {
                    if (i == 0)
                    {
                        splitPoint = offset + emojiLength - 1;
                        candidate = candidate.Slice(0, emojiLength);
                    }
                    else
                    {
                        // NOTE(review): the branches below emit [offset, splitPoint - 1]; with
                        // splitPoint = offset + i - 1 that range looks one character short of the
                        // text before the emoji - confirm with a test before changing.
                        splitPoint = offset + i - 1;
                        candidate = candidate.Slice(0, i);
                    }
                    break;
                }
            }
        }

        // Consume the chunk; every branch either finishes it or shrinks it and re-evaluates.
        while (!candidate.IsEmpty)
        {
            int hash = candidate.CaseSensitiveHash32();
            if (SpecialCases.ContainsKey(hash))
            {
                splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Exception));
                candidate = new ReadOnlySpan<char>();
                offset = splitPoint + 1;
                continue;
            }
            else if (candidate.IsLikeURLorEmail())
            {
                splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.EmailOrUrl));
                candidate = new ReadOnlySpan<char>();
                offset = splitPoint + 1;
                continue;
            }
            else if (hasEmoji && candidate.IsEmoji(out var emojiLength))
            {
                splitPoints.Add(new SplitPoint(offset, offset + emojiLength - 1, SplitPointReason.Emoji));
                candidate = candidate.Slice(emojiLength);
                offset += emojiLength;
                continue;
            }
            else
            {
                if (candidate.Length == 1)
                {
                    splitPoints.Add(new SplitPoint(offset, offset, SplitPointReason.SingleChar));
                    candidate = new ReadOnlySpan<char>();
                    offset = splitPoint + 1;
                    continue;
                }

                if (!candidate.IsAllLetterOrDigit())
                {
                    if (candidate.IsSentencePunctuation() || candidate.IsHyphen() || candidate.IsSymbol())
                    {
                        splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Punctuation));
                        candidate = new ReadOnlySpan<char>();
                        offset = splitPoint + 1;
                        continue;
                    }

                    int prefixLocation = FindPrefix(candidate);
                    if (prefixLocation >= 0)
                    {
                        // Emit the single prefix character and continue with what follows it.
                        splitPoints.Add(new SplitPoint(offset + prefixLocation, offset + prefixLocation, SplitPointReason.Prefix));
                        candidate = candidate.Slice(prefixLocation + 1);
                        offset += prefixLocation + 1;
                        continue;
                    }

                    var (sufixIndex, sufixLength) = FindSufix(candidate);
                    if (sufixIndex > -1)
                    {
                        // Emit the suffix, shorten the chunk, and remember to skip the suffix afterwards.
                        splitPoints.Add(new SplitPoint(offset + sufixIndex, offset + sufixIndex + sufixLength - 1, SplitPointReason.Sufix));
                        candidate = candidate.Slice(0, sufixIndex);
                        splitPoint = offset + sufixIndex;
                        sufix_offset += sufixLength;
                        continue;
                    }

                    var infixLocation = FindInfix(candidate);
                    if (infixLocation.Count > 0)
                    {
                        int in_offset = offset;
                        foreach (var (index, length) in infixLocation)
                        {
                            // Text between the previous infix (or chunk start) and this infix.
                            if ((offset + index - 1) >= in_offset)
                            {
                                splitPoints.Add(new SplitPoint(in_offset, offset + index - 1, SplitPointReason.Infix));
                            }

                            //Test if the remaining is not an exception first
                            if ((in_offset - offset + index) <= candidate.Length)
                            {
                                var rest = candidate.Slice(in_offset - offset + index);
                                int hashRest = rest.CaseSensitiveHash32();
                                if (SpecialCases.ContainsKey(hashRest))
                                {
                                    in_offset = offset + index;
                                    break;
                                }
                            }

                            in_offset = offset + index + length;
                            splitPoints.Add(new SplitPoint(offset + index, offset + index + length - 1, SplitPointReason.Infix));
                        }

                        candidate = candidate.Slice(in_offset - offset);
                        offset = in_offset;
                        continue;
                    }
                }
            }

            // No rule applied: the rest of the chunk is one token.
            splitPoints.Add(new SplitPoint(offset, splitPoint - 1, SplitPointReason.Normal));
            candidate = new ReadOnlySpan<char>();
            offset = splitPoint + 1;
        }
    }

    // Emit tokens in text order, skipping split points identical to the previous one.
    int spanBegin = span.Begin;
    int pB = int.MinValue, pE = int.MinValue;
    span.ReserveTokens(splitPoints.Count);

    foreach (var sp in splitPoints.OrderBy(s => s.Begin).ThenBy(s => s.End))
    {
        int b = sp.Begin;
        int e = sp.End;

        if (pB == b && pE == e)
        {
            continue;
        }
        pB = b;
        pE = e;

        if (b > e)
        {
            Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
            throw new InvalidOperationException();
        }

        // Trim whitespace at both ends; the bound is tested before the character is read.
        while (b < e && char.IsWhiteSpace(textSpan[b]))
        {
            b++;
        }
        while (e > b && char.IsWhiteSpace(textSpan[e]))
        {
            e--;
        }

        // Defensive guard; it must run before the slice below, not after it.
        if (e < b)
        {
            Logger.LogError("Error processing text: '{DOC}', found token with begin={b} and end={e}", span.Value, b, e);
            continue;
        }

        int hash = textSpan.Slice(b, e - b + 1).CaseSensitiveHash32();

        if (SpecialCases.TryGetValue(hash, out TokenizationException exp))
        {
            if (exp.Replacements is null)
            {
                span.AddToken(spanBegin + b, spanBegin + e);
            }
            else
            {
                //TODO: Tokens begins and ends are being artificially placed here, check in the future how to better handle this
                int begin2 = spanBegin + b;
                for (int i = 0; i < exp.Replacements.Length; i++)
                {
                    //Adds replacement tokens sequentially, consuming one char from the original document at a time, and
                    //using the remaining chars in the last replacement token
                    var tk = span.AddToken(begin2, ((i == exp.Replacements.Length - 1) ? (spanBegin + e) : begin2));
                    tk.Replacement = exp.Replacements[i];
                    begin2++;
                }
            }
        }
        else
        {
            var tk = span.AddToken(spanBegin + b, spanBegin + e);
            if (sp.Reason == SplitPointReason.EmailOrUrl && !DisableEmailOrURLCapture)
            {
                tk.AddEntityType(new EntityType("EmailOrURL", EntityTag.Single));
            }
        }
    }
}