Beispiel #1
0
        // Prepares the shared _vocabulary field: creates a new SemanticVocabulary and
        // adds a StandardTokenizer over a short sample text ("the" three times,
        // "some" twice, "text" once) as its source.
        public void Initializate()
        {
            const string sampleText = "the the the some some text";

            _vocabulary = new SemanticVocabulary();
            _vocabulary.AddSource(new StandardTokenizer(sampleText));
        }
Beispiel #2
0
        // Round-trips a SemanticVocabulary through SaveToFile / LoadFromFile and checks
        // that the word counts of the reloaded vocabulary match the original ones.
        public void SemanticVocabularySerialization()
        {
            const string sampleText =
                "In computer science, an inverted " +
                "index (also referred to as postings file or inverted file) is an index data structure storing a mapping from content, " +
                "such as words or numbers, to its locations in a database file, or in a document or a set of documents. " +
                "The purpose of an inverted index is to allow fast full text searches, " +
                "at a cost of increased processing when a document is added to the database. " +
                "The inverted file may be the database file itself, rather than its index. " +
                "It is the most popular data structure used in document retrieval systems,[1] " +
                "used on a large scale for example in search engines. " +
                "Several significant general-purpose mainframe-based database management systems have used " +
                "inverted list architectures, including ADABAS, DATACOM/DB, and Model 204.";

            var vocOrigin = new SemanticVocabulary();
            vocOrigin.AddSource(new StandardTokenizer(sampleText));

            // Expected statistics for the sample text above.
            vocOrigin.TotalWords.Should().Be(130);
            vocOrigin.UniqueWords.Should().Be(79);

            vocOrigin.SaveToFile(VocabularyFileName);

            // Verify that the file exists BEFORE reading it back, so that a failed save
            // is reported by this assertion instead of surfacing as a load error.
            var fileInfo = new FileInfo(VocabularyFileName);
            fileInfo.Exists.Should().BeTrue();

            var vocDeser = SemanticVocabulary.LoadFromFile(VocabularyFileName);

            // NOTE(review): this expects Equals to report the two vocabularies as different,
            // presumably because Equals is reference equality and LoadFromFile returns a new
            // instance - confirm against SemanticVocabulary before changing it.
            vocOrigin.Equals(vocDeser).Should().BeFalse();

            // The reloaded vocabulary must carry the same counts as the original.
            vocDeser.TotalWords.Should().Be(vocOrigin.TotalWords);
            vocDeser.UniqueWords.Should().Be(vocOrigin.UniqueWords);
        }
Beispiel #3
0
        // Asks a populated vocabulary for the semantic weight of a whitespace-only word.
        // The body asserts nothing about the result.
        // NOTE(review): presumably an expected-exception attribute outside this view
        // defines the pass condition - confirm.
        public void SemanticWeightWrongInput()
        {
            // Sample text: "the" three times, "some" twice, "text" once.
            var vocabulary = new SemanticVocabulary();
            vocabulary.AddSource(new StandardTokenizer("the the the some some text"));

            vocabulary.GetSemanticWeight("   ");
        }
Beispiel #4
0
        // Checks the semantic weight reported for a word that does not occur in the
        // source text: it must be 1.79 within a tolerance of 0.01.
        public void SemantciWeghtNotPresentWord()
        {
            // NOTE(review): 1.79 is close to ln(6), and the sample text has six words -
            // presumably the weight of an unseen word derives from the total count; confirm.
            const double expectedWeight = 1.79;
            const double tolerance = 1e-2;

            // Sample text: "the" three times, "some" twice, "text" once.
            var vocabulary = new SemanticVocabulary();
            vocabulary.AddSource(new StandardTokenizer("the the the some some text"));

            var weight = vocabulary.GetSemanticWeight("NotPresent");

            weight.Should().BeApproximately(expectedWeight, tolerance);
        }
Beispiel #5
0
        // Checks the ordering of semantic weights: the word that occurs most often in the
        // sample text must weigh less than the one occurring less often, which in turn
        // must weigh less than the one occurring only once.
        public void SemanticWeightsTest()
        {
            // Sample text: "the" three times, "some" twice, "text" once.
            var vocabulary = new SemanticVocabulary();
            vocabulary.AddSource(new StandardTokenizer("the the the some some text"));

            // The words are queried in upper case although the source text is lower case.
            var frequentWeight = vocabulary.GetSemanticWeight("THE");
            var middleWeight   = vocabulary.GetSemanticWeight("SOME");
            var rareWeight     = vocabulary.GetSemanticWeight("TEXT");

            frequentWeight.Should().BeLessThan(middleWeight);
            middleWeight.Should().BeLessThan(rareWeight);
        }