// Methods

/// <summary>
/// Tokenizes <paramref name="text"/> into n-grams using the supplied strategy and rule set.
/// </summary>
/// <param name="text">Text to tokenize; checked by <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
/// <param name="strategy">Tokenization strategy; checked by <c>Validator.ValidateObject</c>.</param>
/// <param name="ruleSet">Rule set forwarded to <c>TokenizeText</c>; checked by <c>Validator.ValidateObject</c>.</param>
/// <returns>The n-gram list produced by <c>TokenizeText</c>.</returns>
public List<INGram> Do(string text, ITokenizationStrategy strategy, INGramTokenizerRuleSet ruleSet)
{
    // All argument checks run before any tokenization work starts.
    Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
    Validator.ValidateObject(strategy, nameof(strategy));
    Validator.ValidateObject(ruleSet, nameof(ruleSet));

    List<INGram> nGrams = TokenizeText(text, strategy, ruleSet);

    return nGrams;
}
// Constructor(s)

/// <summary>
/// Initializes the n-gram with its order, the strategy it belongs to and its textual value.
/// </summary>
/// <param name="n">Order of the n-gram; checked by <c>Validator.ValidateN</c>.</param>
/// <param name="strategy">Tokenization strategy; checked by <c>Validator.ValidateObject</c>.</param>
/// <param name="value">Text of the n-gram; checked by <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
protected ANGram(ushort n, ITokenizationStrategy strategy, string value)
{
    // Nothing is assigned until all three arguments have passed validation.
    Validator.ValidateN(n);
    Validator.ValidateObject(strategy, nameof(strategy));
    Validator.ValidateStringNullOrWhiteSpace(value, nameof(value));

    this.N = n;
    this.Strategy = strategy;
    this.Value = value;
}
// Methods (public)

/// <summary>
/// Builds a <c>LabeledExample</c> from the given text by tokenizing it with <c>_tokenizer</c>.
/// </summary>
/// <param name="id">Identifier passed through to the <c>LabeledExample</c> constructor.</param>
/// <param name="label">Label of the example; checked by <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
/// <param name="text">Text of the example; checked by <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
/// <param name="strategy">Tokenization strategy; checked by <c>Validator.ValidateObject</c>.</param>
/// <param name="ruleSet">N-gram rule set; checked by <c>Validator.ValidateObject</c>.</param>
/// <returns>A new <c>LabeledExample</c> carrying the id, label, text and the n-grams of the text.</returns>
public LabeledExample Create(ulong id, string label, string text, ITokenizationStrategy strategy, INGramTokenizerRuleSet ruleSet)
{
    Validator.ValidateStringNullOrWhiteSpace(label, nameof(label));
    Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
    Validator.ValidateObject(strategy, nameof(strategy));
    Validator.ValidateObject(ruleSet, nameof(ruleSet));

    // The text is tokenized first; the resulting n-grams become part of the example.
    return new LabeledExample(id, label, text, _tokenizer.Do(text, strategy, ruleSet));
}
/// <summary>
/// Produces the n-grams of type <typeparamref name="T"/> for <paramref name="text"/>.
/// </summary>
/// <typeparam name="T">N-gram type to produce; its order is obtained from <c>GetN</c>.</typeparam>
/// <param name="text">Text that is matched against <c>strategy.Pattern</c>.</param>
/// <param name="strategy">Strategy supplying the regex pattern and forwarded to <c>GetTokens</c>.</param>
/// <returns>The tokens returned by <c>GetTokens</c>.</returns>
/// <exception cref="Exception">Thrown when the strategy's pattern yields no matches in the text.</exception>
private List<T> DoFor<T>(string text, ITokenizationStrategy strategy) where T : INGram
{
    // "This is a sample text." => "This", "is", ..., "text"
    MatchCollection wordMatches = Regex.Matches(text, strategy.Pattern);

    bool nothingMatched = wordMatches.Count < 1;
    if (nothingMatched)
    {
        string message = MessageCollection.NGramsTokenizer_ProvidedTokenizationStrategyPatternReturnsZeroMatches.Invoke(strategy);

        throw new Exception(message);
    }

    ushort N = GetN<T>();

    // The second name is a fixed literal handed to the validator; it is kept verbatim
    // so whatever the validator reports stays unchanged.
    Validator.ThrowIfFirstIsGreater(N, nameof(N), wordMatches.Count, "matches.Count");

    List<T> tokens = GetTokens<T>(N, wordMatches, strategy);

    return tokens;
}
// Methods

/// <summary>
/// Predicts a label for <paramref name="text"/> by tokenizing it and comparing the resulting
/// n-grams against the provided labeled examples, logging each step through
/// <c>_components.LoggingAction</c>.
/// </summary>
/// <param name="text">Text to classify; checked by <c>Validator.ValidateStringNullOrWhiteSpace</c>.</param>
/// <param name="strategy">Tokenization strategy; checked by <c>Validator.ValidateObject</c>.</param>
/// <param name="ruleSet">N-gram rule set; checked by <c>Validator.ValidateObject</c>.</param>
/// <param name="labeledExamples">Examples to compare against; checked by <c>Validator.ValidateList</c>.</param>
/// <returns>
/// A <c>TextClassifierResult</c> built from the predicted label, the similarity indexes and
/// the similarity index averages. A <c>null</c> label is logged as a failed prediction.
/// </returns>
public TextClassifierResult PredictLabel(string text, ITokenizationStrategy strategy, INGramTokenizerRuleSet ruleSet, List<LabeledExample> labeledExamples)
{
    Validator.ValidateStringNullOrWhiteSpace(text, nameof(text));
    Validator.ValidateObject(strategy, nameof(strategy));
    Validator.ValidateObject(ruleSet, nameof(ruleSet));
    Validator.ValidateList(labeledExamples, nameof(labeledExamples));

    // Describe the inputs; the text is shortened for the log only, never for processing.
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_AttemptingToPredictLabel);
    string textForLog = _components.TextTruncatingFunction.Invoke(text, _settings.TruncateTextInLogMessagesAfter);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_FollowingTextHasBeenProvided.Invoke(textForLog));
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_FollowingTokenizationStrategyWillBeUsed.Invoke(strategy));
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_FollowingNGramsTokenizerRuleSetWillBeUsed.Invoke(ruleSet));
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_XLabeledExamplesHaveBeenProvided.Invoke(labeledExamples));

    // Step 1: tokenize the full text.
    List<INGram> tokens = _components.NGramsTokenizer.Do(text, strategy, ruleSet);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_ProvidedTextHasBeenTokenizedIntoXNGrams.Invoke(tokens));

    // Step 2: compare the tokens against the labeled examples.
    List<SimilarityIndex> similarityIndexes = GetSimilarityIndexes(tokens, labeledExamples);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_TokenizedTextHasBeenComparedAgainstTheProvidedLabeledExamples);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_XSimilarityIndexObjectsHaveBeenComputed.Invoke(similarityIndexes));

    // Step 3: compute the averages from the similarity indexes.
    List<SimilarityIndexAverage> similarityAverages = GetSimilarityIndexAverages(similarityIndexes);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_XSimilarityIndexAverageObjectsHaveBeenComputed(similarityAverages));

    // Step 4: derive the label from the averages; the label is logged before its null check.
    string predictedLabel = PredictLabel(similarityAverages);
    _components.LoggingAction.Invoke(MessageCollection.TextClassifier_PredictedLabelIs.Invoke(predictedLabel));

    if (predictedLabel != null)
    {
        _components.LoggingAction.Invoke(MessageCollection.TextClassifier_PredictionHasBeenSuccessful);
    }
    else
    {
        _components.LoggingAction.Invoke(MessageCollection.TextClassifier_PredictionHasFailedTryIncreasingTheAmountOfProvidedLabeledExamples);
    }

    return new TextClassifierResult(predictedLabel, similarityIndexes, similarityAverages);
}
// Methods (private)

/// <summary>
/// Applies every n-gram rule enabled in <paramref name="ruleSet"/> to <paramref name="text"/>
/// and collects the resulting n-grams in rule order (monograms first, fivegrams last).
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <param name="strategy">Tokenization strategy forwarded to <c>DoFor</c>.</param>
/// <param name="ruleSet">Flags selecting which n-gram orders are produced.</param>
/// <returns>The n-grams of all enabled rules, concatenated in rule order.</returns>
/// <exception cref="Exception">
/// Thrown when applying a rule fails. The message names the rule that was being applied and
/// the original failure is available through <c>InnerException</c>.
/// </exception>
private List<INGram> TokenizeText(string text, ITokenizationStrategy strategy, INGramTokenizerRuleSet ruleSet)
{
    List<INGram> nGrams = new List<INGram>();

    // Tracks the rule currently being applied so a failure can be attributed to it.
    string ruleName = null;

    try
    {
        if (ruleSet.DoForMonograms)
        {
            ruleName = nameof(ruleSet.DoForMonograms);
            nGrams.AddRange(DoFor<Monogram>(text, strategy));
        }

        if (ruleSet.DoForBigrams)
        {
            ruleName = nameof(ruleSet.DoForBigrams);
            nGrams.AddRange(DoFor<Bigram>(text, strategy));
        }

        if (ruleSet.DoForTrigrams)
        {
            ruleName = nameof(ruleSet.DoForTrigrams);
            nGrams.AddRange(DoFor<Trigram>(text, strategy));
        }

        if (ruleSet.DoForFourgrams)
        {
            ruleName = nameof(ruleSet.DoForFourgrams);
            nGrams.AddRange(DoFor<Fourgram>(text, strategy));
        }

        if (ruleSet.DoForFivegrams)
        {
            ruleName = nameof(ruleSet.DoForFivegrams);
            nGrams.AddRange(DoFor<Fivegram>(text, strategy));
        }

        return nGrams;
    }
    catch (Exception cause)
    {
        // Same exception type and message as before, but the original failure is now
        // attached as InnerException instead of being discarded, so its type, message
        // and stack trace remain available for diagnosis.
        throw new Exception(MessageCollection.NGramTokenizer_TheRuleCantBeAppliedTo.Invoke(ruleName, text), cause);
    }
}
/// <summary>
/// Slides a window of up to <paramref name="N"/> words over the matched words and turns each
/// window into one token of type <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type of the tokens created through <c>CreateInstance</c>.</typeparam>
/// <param name="N">Maximum number of words per token.</param>
/// <param name="matches">Regex matches that are converted to a word array via <c>ConvertToArray</c>.</param>
/// <param name="strategy">Supplies the delimiter and the lower-casing flag.</param>
/// <returns>One token per start position in the word array, in order.</returns>
private List<T> GetTokens<T>(ushort N, MatchCollection matches, ITokenizationStrategy strategy)
{
    string[] words = ConvertToArray(matches);
    List<T> result = new List<T>();

    for (uint start = 0; start < words.Length; start++)
    {
        // Near the end fewer than N words remain, so the last windows are shorter.
        long remaining = words.Length - start;
        ushort windowSize = remaining < N ? (ushort)remaining : N;

        // For N = 3: ["This", "is", "a"], ["is", "a", "sample"], ...
        string[] window = _ArrayManager.GetSubset(words, start, windowSize);

        // [ "This", "is", "a" ] => [ "This", " ", "is", " ", "a" ]
        string[] pieces = _ArrayManager.AddDelimiter(window, strategy.Delimiter);

        // [ "This", " ", "is", " ", "a" ] => "This is a"
        StringBuilder builder = new StringBuilder();
        for (int k = 0; k < pieces.Length; k++)
        {
            builder.Append(pieces[k]);
        }

        string tokenText = builder.ToString();

        // "This is a" => "this is a"
        // NOTE(review): ToLower() without arguments uses the current culture.
        if (strategy.ToLowercase)
        {
            tokenText = tokenText.ToLower();
        }

        // e.g. new Monogram(strategy, "this is a")
        result.Add(CreateInstance<T>(strategy, tokenText));
    }

    return result;
}
// Fields
// Properties
// Constructors

/// <summary>
/// Creates a fourgram by forwarding an order of 4 together with the given strategy and value
/// to the base constructor.
/// </summary>
/// <param name="strategy">Tokenization strategy passed to the base constructor.</param>
/// <param name="value">Text of the n-gram passed to the base constructor.</param>
public Fourgram(ITokenizationStrategy strategy, string value)
    : base(4, strategy, value)
{
}
/// <summary>
/// Tokenizes <paramref name="text"/> using a newly constructed <c>NGramTokenizerRuleSet</c>.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <param name="strategy">Tokenization strategy to apply.</param>
/// <returns>The result of the three-argument <c>Do</c> overload.</returns>
public List<INGram> Do(string text, ITokenizationStrategy strategy)
{
    return Do(text, strategy, new NGramTokenizerRuleSet());
}
// Fields
// Properties
// Constructors

/// <summary>
/// Creates a monogram by forwarding an order of 1 together with the given strategy and value
/// to the base constructor.
/// </summary>
/// <param name="strategy">Tokenization strategy passed to the base constructor.</param>
/// <param name="value">Text of the n-gram passed to the base constructor.</param>
public Monogram(ITokenizationStrategy strategy, string value)
    : base(1, strategy, value)
{
}
// Fields
// Properties
// Constructors

/// <summary>
/// Creates a bigram by forwarding an order of 2 together with the given strategy and value
/// to the base constructor.
/// </summary>
/// <param name="strategy">Tokenization strategy passed to the base constructor.</param>
/// <param name="value">Text of the n-gram passed to the base constructor.</param>
public Bigram(ITokenizationStrategy strategy, string value)
    : base(2, strategy, value)
{
}
// Fields
// Properties
// Constructors

/// <summary>
/// Creates an n-gram whose order is chosen by the caller; all three arguments are forwarded
/// unchanged to the base constructor.
/// </summary>
/// <param name="n">Order passed to the base constructor.</param>
/// <param name="strategy">Tokenization strategy passed to the base constructor.</param>
/// <param name="value">Text of the n-gram passed to the base constructor.</param>
public FakeGram(ushort n, ITokenizationStrategy strategy, string value)
    : base(n, strategy, value)
{
}
// Fields
// Properties
// Constructors

/// <summary>
/// Creates a fivegram by forwarding an order of 5 together with the given strategy and value
/// to the base constructor.
/// </summary>
/// <param name="strategy">Tokenization strategy passed to the base constructor.</param>
/// <param name="value">Text of the n-gram passed to the base constructor.</param>
public Fivegram(ITokenizationStrategy strategy, string value)
    : base(5, strategy, value)
{
}
/// <summary>
/// Builds a <c>LabeledExample</c> using a newly constructed <c>NGramTokenizerRuleSet</c>.
/// </summary>
/// <param name="id">Identifier of the example.</param>
/// <param name="label">Label of the example.</param>
/// <param name="text">Text of the example.</param>
/// <param name="strategy">Tokenization strategy to apply.</param>
/// <returns>The result of the five-argument <c>Create</c> overload.</returns>
public LabeledExample Create(ulong id, string label, string text, ITokenizationStrategy strategy)
{
    return Create(id, label, text, strategy, new NGramTokenizerRuleSet());
}