public void Indexes_Build_Time()
{
    const int buildTimes = 3;
    var stopWatch = new Stopwatch();
    var tokenizer = new DefaultTokenizer();
    var lines = TestDataGenerator.GetRandomLines(50000, 20000);

    stopWatch.Start();
    for (var i = 0; i < buildTimes; i++)
    {
        BuildIndex(new InvertedHashIndex(tokenizer), lines);
    }
    var hashIndexBuildTime = stopWatch.Elapsed;

    stopWatch.Restart();
    for (var i = 0; i < buildTimes; i++)
    {
        BuildIndex(new InvertedIndex(tokenizer), lines);
    }
    var suffixIndexBuildTime = stopWatch.Elapsed;

    Console.WriteLine($"HashIndex build time: {hashIndexBuildTime}");
    Console.WriteLine($"SuffixIndex build time: {suffixIndexBuildTime}");
}
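// The index benchmarks in this collection call a BuildIndex helper that is not shown.
// A minimal sketch of such a helper, assuming a common IIndex abstraction shared by
// InvertedIndex and InvertedHashIndex with an Add(string line, int rowNumber) method
// (the interface name and the Add signature are assumptions, not taken from the source):
private static void BuildIndex(IIndex index, string[] lines)
{
    // Feed every generated line into the index, using the line's position as its row number.
    for (var row = 0; row < lines.Length; row++)
    {
        index.Add(lines[row], row);
    }
}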
public void TestConstructors()
{
    ITokenizer tok = null;
    try
    {
        tok = new DefaultTokenizer(null);
        Assert.Fail("Shouldn't be able to set a tokenizer of null.");
    }
    catch
    {
        Assert.IsTrue(true);
    }

    tok = new DefaultTokenizer(string.Empty);
    tok = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WHITESPACE);
    tok = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);

    try
    {
        tok = new DefaultTokenizer(43);
        Assert.Fail("Shouldn't be able to set a tokenizer of type 43.");
    }
    catch
    {
        Assert.IsTrue(true);
    }
}
public void Detokenize()
{
    var Tokenizer = new DefaultTokenizer(new ITokenizerLanguage[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    });
    var Result = Tokenizer.Tokenize("This is a test.", TokenizerLanguage.EnglishRuleBased);
    Assert.Equal("This is a test.", Tokenizer.Detokenize(Result, TokenizerLanguage.EnglishRuleBased));
}
public void TestTokenize()
{
    ITokenizer tok = null;
    string[] words = null;

    tok = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WHITESPACE);
    words = tok.Tokenize("My very,new string!");
    Assert.AreEqual(3, words.Length);
    Assert.AreEqual("My", words[0]);
    Assert.AreEqual("very,new", words[1]);
    Assert.AreEqual("string!", words[2]);

    tok = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);
    words = tok.Tokenize("My very,new-string!and/more(NIO)peter's 1.4");
    Assert.AreEqual(11, words.Length);
    Assert.AreEqual("My", words[0]);
    Assert.AreEqual("very", words[1]);
    Assert.AreEqual("new", words[2]);
    Assert.AreEqual("string", words[3]);
    Assert.AreEqual("and", words[4]);
    Assert.AreEqual("more", words[5]);
    Assert.AreEqual("NIO", words[6]);
    // TODO: shouldn't this be "peter's" instead of "peter" and "s"?
    Assert.AreEqual("peter", words[7]);
    Assert.AreEqual("s", words[8]);
    // TODO: shouldn't this be "1.4" instead of "1" and "4"?
    Assert.AreEqual("1", words[9]);
    Assert.AreEqual("4", words[10]);
}
public void TestGetUniqueWords()
{
    string[] result = Utilities.GetUniqueWords(null);
    Assert.IsNotNull(result);
    Assert.AreEqual(0, result.Length);

    string[] input = new string[] { "one", "one", "one", "two", "three" };
    string[] expectedResult = new string[] { "one", "three", "two" };
    result = Utilities.GetUniqueWords(input);
    Assert.IsNotNull(result);
    Assert.AreEqual(expectedResult.Length, result.Length);
    Array.Sort(expectedResult);
    Array.Sort(result);
    for (int i = 0; i < expectedResult.Length; i++)
    {
        Assert.AreEqual(expectedResult[i], result[i]);
    }

    string[] words = new DefaultTokenizer().Tokenize(sentence.ToLower());
    result = Utilities.GetUniqueWords(words);
    Assert.AreEqual(5, result.Length);
}
public static ITokenizer Create(Tokenizer tokenizer)
{
    DefaultTokenizer defaultTokenizer = new DefaultTokenizer();
    defaultTokenizer.Initialize(tokenizer.Parameters);
    return defaultTokenizer;
}
public void TestGetTokenizer()
{
    SimpleWordsDataSource wds = new SimpleWordsDataSource();
    ITokenizer tokenizer = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);
    BayesianClassifier classifier = new BayesianClassifier(wds, tokenizer);
    Assert.AreEqual(tokenizer, classifier.Tokenizer);
}
public void Tag()
{
    var TestObject = new BrillTagger();
    var Tokenizer = new DefaultTokenizer(new[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    });
    var Results = TestObject.Tag(Tokenizer.Tokenize("I would go buy a computer.", TokenizerLanguage.EnglishRuleBased));
    Assert.Equal("NN VM VVB NN RR NN", Results.Where(x => x.TokenType == TokenType.Word).ToString(x => x.PartOfSpeech, " "));
}
public void Return_Empty_List_For_Null_Or_WhiteSpace_String(string s)
{
    var tokenizer = new DefaultTokenizer();
    var tokens = tokenizer.GetTokens(s);
    tokens.Should().BeEmpty();
}
public void Tag()
{
    var TestObject = new DefaultTagger(new[]
    {
        new SimpleTagger(
            Canister.Builder.Bootstrapper.Resolve<IInflector>(),
            Canister.Builder.Bootstrapper.Resolve<ISynonymFinder>())
    });
    var Normalizer = new DefaultNormalizer(
        new INormalizer[] { new ASCIIFolder(ObjectPool), new LowerCase() },
        new ITextNormalizer[] { new HTMLToText(ObjectPool) });
    var Tokenizer = new DefaultTokenizer(new[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }, ObjectPool);
    var Results = TestObject.Tag(
        Normalizer.Normalize(
            Tokenizer.Tokenize(
                Normalizer.Normalize("I would go buy a computer."),
                TokenizerLanguage.EnglishRuleBased)),
        POSTaggerLanguage.BrillTagger);
    Assert.Equal("NNP MD VB VB DT NN", Results.Where(x => x.TokenType == TokenType.Word).ToString(x => x.PartOfSpeech, " "));
}
public void TagProperNoun()
{
    var TestObject = new SimpleTagger(
        Canister.Builder.Bootstrapper.Resolve<IInflector>(),
        Canister.Builder.Bootstrapper.Resolve<ISynonymFinder>());
    var Normalizer = new DefaultNormalizer(
        new INormalizer[] { new ASCIIFolder(ObjectPool), new LowerCase() },
        new ITextNormalizer[] { new HTMLToText(ObjectPool) });
    var Tokenizer = new DefaultTokenizer(new[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }, ObjectPool);
    var Results = TestObject.Tag(
        Normalizer.Normalize(
            Tokenizer.Tokenize(
                Normalizer.Normalize("I want to go to New York City."),
                TokenizerLanguage.EnglishRuleBased)));
    Assert.Equal("NNP VBP TO VB TO NNP NNP NNP", Results.Where(x => x.TokenType == TokenType.Word).ToString(x => x.PartOfSpeech, " "));
}
public void TestGetStopWordProvider()
{
    var wds = new SimpleWordsDataSource();
    ITokenizer tokenizer = new DefaultTokenizer(DefaultTokenizer.BREAK_ON_WORD_BREAKS);
    IStopWordProvider stopWordProvider = new DefaultStopWordProvider();
    var classifier = new BayesianClassifier(wds, tokenizer, stopWordProvider);
    Assert.AreEqual(stopWordProvider, classifier.StopWordProvider);
}
public void Detect() { string Text = "\"Darkness cannot drive out darkness: only light can do that. Hate cannot drive out hate: only love can do that.\""; var Tokenizer = new DefaultTokenizer(new[] { new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() }) }, ObjectPool); var Results = new DefaultDetector().DetectSentences(Tokenizer.Tokenize(Text, TokenizerLanguage.EnglishRuleBased)); Assert.Equal(2, Results.Length); Assert.Equal("Darkness cannot drive out darkness: only light can do that.", Results[0].ToString()); Assert.Equal("Hate cannot drive out hate: only love can do that.", Results[1].ToString()); }
public void TokenizeNull()
{
    var Result = new DefaultTokenizer(new ITokenizerLanguage[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }).Tokenize(null, TokenizerLanguage.EnglishRuleBased);
    Assert.Single(Result);
    Assert.Equal(0, Result[0].EndPosition);
    Assert.Equal(0, Result[0].StartPosition);
    Assert.Equal(TokenType.EOF, Result[0].TokenType);
    Assert.Equal(string.Empty, Result[0].Value);
}
public void Terms_Converted_To_Lower_Case()
{
    const string s = "TeSt";
    var tokenizer = new DefaultTokenizer();
    var tokens = tokenizer.GetTokens(s);
    tokens.Count.Should().Be(1);
    tokens.Should().BeEquivalentTo(new Token { Term = "test", Position = 1, DistanceToNext = 0 });
}
public void Tokenize()
{
    var Result = new DefaultTokenizer(new ITokenizerLanguage[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }).Tokenize("This is a test.", TokenizerLanguage.EnglishRuleBased);
    Assert.Equal(9, Result.Length);

    Assert.Equal(3, Result[0].EndPosition);
    Assert.Equal(0, Result[0].StartPosition);
    Assert.Equal(TokenType.Word, Result[0].TokenType);
    Assert.Equal("This", Result[0].Value);

    Assert.Equal(4, Result[1].EndPosition);
    Assert.Equal(4, Result[1].StartPosition);
    Assert.Equal(TokenType.WhiteSpace, Result[1].TokenType);
    Assert.Equal(" ", Result[1].Value);

    Assert.Equal(6, Result[2].EndPosition);
    Assert.Equal(5, Result[2].StartPosition);
    Assert.Equal(TokenType.Word, Result[2].TokenType);
    Assert.Equal("is", Result[2].Value);

    Assert.Equal(7, Result[3].EndPosition);
    Assert.Equal(7, Result[3].StartPosition);
    Assert.Equal(TokenType.WhiteSpace, Result[3].TokenType);
    Assert.Equal(" ", Result[3].Value);

    Assert.Equal(8, Result[4].EndPosition);
    Assert.Equal(8, Result[4].StartPosition);
    Assert.Equal(TokenType.Word, Result[4].TokenType);
    Assert.Equal("a", Result[4].Value);

    Assert.Equal(9, Result[5].EndPosition);
    Assert.Equal(9, Result[5].StartPosition);
    Assert.Equal(TokenType.WhiteSpace, Result[5].TokenType);
    Assert.Equal(" ", Result[5].Value);

    Assert.Equal(13, Result[6].EndPosition);
    Assert.Equal(10, Result[6].StartPosition);
    Assert.Equal(TokenType.Word, Result[6].TokenType);
    Assert.Equal("test", Result[6].Value);

    Assert.Equal(14, Result[7].EndPosition);
    Assert.Equal(14, Result[7].StartPosition);
    Assert.Equal(TokenType.Period, Result[7].TokenType);
    Assert.Equal(".", Result[7].Value);

    Assert.Equal(15, Result[8].EndPosition);
    Assert.Equal(15, Result[8].StartPosition);
    Assert.Equal(TokenType.EOF, Result[8].TokenType);
    Assert.Equal(string.Empty, Result[8].Value);
}
public void BasicTest()
{
    var EntityFinder = new DefaultEntityFinder(new IFinder[] { new DefaultFinder() });
    var TestObject = new DefaultTagger(new[]
    {
        new SimpleTagger(
            Canister.Builder.Bootstrapper.Resolve<IInflector>(),
            Canister.Builder.Bootstrapper.Resolve<ISynonymFinder>())
    });
    var Normalizer = new DefaultNormalizer(
        new INormalizer[] { new ASCIIFolder(ObjectPool), new LowerCase() },
        new ITextNormalizer[] { new HTMLToText(ObjectPool) });
    var Tokenizer = new DefaultTokenizer(new[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }, ObjectPool);
    var Results = EntityFinder.Find(
        TestObject.Tag(
            Normalizer.Normalize(
                Tokenizer.Tokenize(
                    Normalizer.Normalize("I wish G.M. made slightly better products."),
                    TokenizerLanguage.EnglishRuleBased)),
            POSTaggerLanguage.BrillTagger),
        EntityFinderLanguage.DefaultFinder);
    Assert.True(Results[0].Entity);
    Assert.False(Results[1].Entity);
    Assert.False(Results[2].Entity);
    Assert.False(Results[3].Entity);
    Assert.True(Results[4].Entity);
    Assert.False(Results[5].Entity);
}
public void Get_Tokens_With_Additional_Spaces()
{
    // Note: the repeated spaces in the input are what the asserted positions rely on.
    const string s = "  et  !   as ";
    var tokenizer = new DefaultTokenizer();
    var tokens = tokenizer.GetTokens(s);
    tokens.Count.Should().Be(3);
    tokens[0].Should().BeEquivalentTo(new Token { Term = "et", Position = 3, DistanceToNext = 4 });
    tokens[1].Should().BeEquivalentTo(new Token { Term = "!", Position = 7, DistanceToNext = 4 });
    tokens[2].Should().BeEquivalentTo(new Token { Term = "as", Position = 11, DistanceToNext = 0 });
}
public void Search_Result_Is_Same_For_InvertedIndex_And_SimpleSearching()
{
    var tokenizer = new DefaultTokenizer();
    var ticks = Environment.TickCount;
    Console.WriteLine($"Test ticks: {ticks}");
    var lines = TestDataGenerator.GetRandomLines(ticks);
    var invertedIndex = new InvertedIndex(tokenizer);
    var phrase = TestDataGenerator.GetSearchPhrase(ticks);
    BuildIndex(invertedIndex, lines);

    var inmemoryResult = InmemorySimpleSearch.Find(lines, phrase);

    invertedIndex.Find(phrase)
        .Select(x => new { x.RowNumber, x.ColNumber })
        .Should()
        .BeEquivalentTo(inmemoryResult.Select(x => new { x.RowNumber, x.ColNumber }));
}
public void InvertedIndex_Should_Be_Faster_Than_Simple_Searching()
{
    const int phrasesCount = 50;
    var phrases = new string[phrasesCount];
    var tickCount = Environment.TickCount;
    Console.WriteLine($"TickCount: {tickCount}");
    for (var i = 0; i < phrasesCount; i++)
    {
        phrases[i] = TestDataGenerator.GetSearchPhrase(tickCount + i);
    }

    var tokenizer = new DefaultTokenizer();
    var stopWatch = new Stopwatch();
    var lines = TestDataGenerator.GetRandomLines(tickCount, 50000);
    var invertedIndex = new InvertedIndex(tokenizer);
    BuildIndex(invertedIndex, lines);

    stopWatch.Start();
    for (var i = 0; i < phrasesCount; i++)
    {
        var elapsedBefore = stopWatch.Elapsed;
        invertedIndex.Find(phrases[i]);
        var elapsed = stopWatch.Elapsed - elapsedBefore;
        Console.WriteLine($"Elapsed for phrase: {phrases[i]} {elapsed}");
    }
    var indexSearchingTime = stopWatch.Elapsed;

    stopWatch.Restart();
    for (var i = 0; i < phrasesCount; i++)
    {
        InmemorySimpleSearch.Find(lines, phrases[i]);
    }
    var simpleSearchingTime = stopWatch.Elapsed;

    Console.WriteLine($"InvertedIndex searching time: {indexSearchingTime}");
    Console.WriteLine($"Simple searching time: {simpleSearchingTime}");
    indexSearchingTime.Should().BeLessThan(simpleSearchingTime);
}
public HashSet<int> ListWords(string content)
{
    HashSet<int> result = new HashSet<int>();
    var tokenizer = new DefaultTokenizer();
    tokenizer.SetDoc(content);
    var token = tokenizer.ConsumeNext();
    while (token != null)
    {
        // Registers the word in the index if it is new, then records its id.
        var wordid = WordIndex.GetOrAddWord(token.Value);
        if (wordid >= 0)
        {
            result.Add(wordid);
        }
        token = tokenizer.ConsumeNext();
    }
    return result;
}
public HashSet<int> ReadWords(string keywords)
{
    HashSet<int> result = new HashSet<int>();
    var tokenizer = new DefaultTokenizer();
    tokenizer.SetDoc(keywords);
    var token = tokenizer.ConsumeNext();
    while (token != null)
    {
        // Only looks the word up; unknown query words (id of -1) are skipped.
        var wordid = WordIndex.GetWord(token.Value);
        if (wordid != -1)
        {
            result.Add(wordid);
        }
        token = tokenizer.ConsumeNext();
    }
    return result;
}
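// A minimal sketch (not part of the original source) of how ListWords and ReadWords could
// be combined into a naive keyword match: a document matches when every word id resolved
// from the query also occurs among the document's word ids. The method name Matches is an
// assumption used only for illustration.
public bool Matches(string content, string keywords)
{
    HashSet<int> documentWords = ListWords(content);
    HashSet<int> queryWords = ReadWords(keywords);

    // Require a non-empty query and full containment of the query ids in the document ids.
    return queryWords.Count > 0 && queryWords.IsSubsetOf(documentWords);
}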
public static ITokenizer Create(Tokenizer tokenizerConfig)
{
    ITokenizer tokenizer;
    switch (tokenizerConfig.Type)
    {
        case "None":
            tokenizer = new NoneTokenizer();
            break;
        default:
            tokenizer = new DefaultTokenizer();
            break;
    }
    tokenizer.Initialize(tokenizerConfig.Parameters);
    return tokenizer;
}
public void Get_Tokens_With_Dots_And_Brackets()
{
    const string s = ". m.Get(new); ";
    var tokenizer = new DefaultTokenizer();
    var tokens = tokenizer.GetTokens(s);
    tokens.Count.Should().Be(4);
    tokens[0].Should().BeEquivalentTo(new Token { Term = ".", Position = 1, DistanceToNext = 2 });
    tokens[1].Should().BeEquivalentTo(new Token { Term = "m", Position = 3, DistanceToNext = 1 });
    tokens[2].Should().BeEquivalentTo(new Token { Term = ".get", Position = 4, DistanceToNext = 4 });
    tokens[3].Should().BeEquivalentTo(new Token { Term = "(new);", Position = 8, DistanceToNext = 0 });
}
public void Get_Tokens_With_Single_Spaces()
{
    const string s = "it a test phrase";
    var tokenizer = new DefaultTokenizer();
    var tokens = tokenizer.GetTokens(s);
    tokens.Count.Should().Be(4);
    tokens[0].Should().BeEquivalentTo(new Token { Term = "it", Position = 1, DistanceToNext = 3 });
    tokens[1].Should().BeEquivalentTo(new Token { Term = "a", Position = 4, DistanceToNext = 2 });
    tokens[2].Should().BeEquivalentTo(new Token { Term = "test", Position = 6, DistanceToNext = 5 });
    tokens[3].Should().BeEquivalentTo(new Token { Term = "phrase", Position = 11, DistanceToNext = 0 });
}
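// For reference, a minimal Token shape that is consistent with the object initializers used
// in the GetTokens tests above (a sketch; the real type may carry additional members):
public class Token
{
    // Lower-cased term text produced by the tokenizer.
    public string Term { get; set; }

    // 1-based character position of the term's first character in the input string.
    public int Position { get; set; }

    // Offset from this term's start position to the next term's start; 0 for the last term.
    public int DistanceToNext { get; set; }
}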
public void Detect()
{
    string Text = @"That can I;
At least, the whisper goes so. Our last king,
Whose image even but now appear'd to us,
Was, as you know, by Fortinbras of Norway,
Thereto prick'd on by a most emulate pride,
Dared to the combat; in which our valiant Hamlet--
For so this side of our known world esteem'd him--
Did slay this Fortinbras; who by a seal'd compact,
Well ratified by law and heraldry,
Did forfeit, with his life, all those his lands
Which he stood seized of, to the conqueror:
Against the which, a moiety competent
Was gaged by our king; which had return'd
To the inheritance of Fortinbras,
Had he been vanquisher; as, by the same covenant,
And carriage of the article design'd,
His fell to Hamlet. Now, sir, young Fortinbras,
Of unimproved mettle hot and full,
Hath in the skirts of Norway here and there
Shark'd up a list of lawless resolutes,
For food and diet, to some enterprise
That hath a stomach in't; which is no other--
As it doth well appear unto our state--
But to recover of us, by strong hand
And terms compulsatory, those foresaid lands
So by his father lost: and this, I take it,
Is the main motive of our preparations,
The source of this our watch and the chief head
Of this post-haste and romage in the land.";
    var Tokenizer = new DefaultTokenizer(new[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol(), new NewLine() })
    }, ObjectPool);
    var Results = new NewLineDetector().DetectSentences(Tokenizer.Tokenize(Text, TokenizerLanguage.EnglishRuleBased));
    Assert.Equal(29, Results.Length);
}
public void TokenizeLongerText()
{
    var Result = new DefaultTokenizer(new ITokenizerLanguage[]
    {
        new EnglishLanguage(new IEnglishTokenFinder[] { new Word(), new Whitespace(), new Symbol() })
    }).Tokenize(@"""I said, 'what're you? Crazy?'"" said Sandowsky. ""I can't afford to do that.""", TokenizerLanguage.EnglishRuleBased);
    Assert.Equal(37, Result.Length);
}