Example #1
        public IncrementalBowSpace()
        {
            // configure tokenizer
            UnicodeTokenizer tokenizer = (UnicodeTokenizer)mTokenizer;

            tokenizer.Filter      = TokenizerFilter.AlphanumLoose;
            tokenizer.MinTokenLen = 2;
        }
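For context, a quick sketch of what this configuration does to input text, using the enumeration pattern shown in Example #7 below; the exact token stream produced by AlphanumLoose is an assumption:

            var tokenizer = new UnicodeTokenizer
            {
                Filter      = TokenizerFilter.AlphanumLoose, // keep alphanumeric tokens
                MinTokenLen = 2                              // drop single-character tokens
            };

            tokenizer.Text = "C3PO met R2, twice.";
            foreach (string token in tokenizer)
            {
                Console.Write("\"{0}\" ", token);
            }
            // Likely output: "C3PO" "met" "R2" "twice" (punctuation and single characters are dropped).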
Example #2
        private static async Task Query(string dir, string collectionName)
        {
            var tokenizer      = new UnicodeTokenizer();
            var qp             = new TermQueryParser();
            var sessionFactory = new SessionFactory(
                dir,
                tokenizer,
                new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini")));

            while (true)
            {
                Console.Write("query>");

                var input = Console.ReadLine();

                if (string.IsNullOrWhiteSpace(input) || input == "q" || input == "quit")
                {
                    break;
                }

                var q = qp.Parse(collectionName.ToHash(), input, tokenizer);
                q.Skip = 0;
                q.Take = 100;

                using (var session = sessionFactory.CreateReadSession(collectionName, collectionName.ToHash()))
                {
                    var result = await session.Read(q);

                    var docs = result.Docs;

                    if (docs.Count > 0)
                    {
                        var index = 0;

                        foreach (var doc in docs.Take(10))
                        {
                            Console.WriteLine("{0} {1} {2}", index++, doc["___score"], doc["title"]);
                        }
                    }
                }
            }
        }
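A hypothetical invocation of the query loop above; the directory and collection name are placeholders, and the directory is expected to hold an existing index:

            await Query(@"c:\data\index", "wikipedia");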
Example #3
        public void FlushBlock()
        {
            if (inBody == 0)
            {
                if (Sharpen.Runtime.EqualsIgnoreCase("TITLE", lastStartTag))
                {
                    SetTitle(tokenBuilder.ToString().Trim());
                }
                textBuilder.Length  = 0;
                tokenBuilder.Length = 0;
                return;
            }

            int length = tokenBuilder.Length;

            if (length == 0)
            {
                return;
            }
            else if (length == 1)
            {
                if (sbLastWasWhitespace)
                {
                    textBuilder.Length  = 0;
                    tokenBuilder.Length = 0;
                    return;
                }
            }

            string[] tokens              = UnicodeTokenizer.Tokenize(tokenBuilder);
            int      numWords            = 0;
            int      numLinkedWords      = 0;
            int      numWrappedLines     = 0;
            int      currentLineLength   = -1;      // don't count the first space
            int      maxLineLength       = 80;
            int      numTokens           = 0;
            int      numWordsCurrentLine = 0;

            foreach (string token in tokens)
            {
                if (token == ANCHOR_TEXT_START)
                {
                    inAnchorText = true;
                }
                else if (token == ANCHOR_TEXT_END)
                {
                    inAnchorText = false;
                }
                else if (IsWord(token))
                {
                    numTokens++;
                    numWords++;
                    numWordsCurrentLine++;

                    if (inAnchorText)
                    {
                        numLinkedWords++;
                    }
                    int tokenLength = token.Length;
                    currentLineLength += tokenLength + 1;
                    if (currentLineLength > maxLineLength)
                    {
                        numWrappedLines++;
                        currentLineLength   = tokenLength;
                        numWordsCurrentLine = 1;
                    }
                }
                else
                {
                    numTokens++;
                }
            }
            if (numTokens == 0)
            {
                return;
            }
            int numWordsInWrappedLines;

            if (numWrappedLines == 0)
            {
                numWordsInWrappedLines = numWords;
                numWrappedLines        = 1;
            }
            else
            {
                numWordsInWrappedLines = numWords - numWordsCurrentLine;
            }
            TextBlock tb = new TextBlock(
                textBuilder.ToString().Trim(),
                currentContainedTextElements,
                numWords,
                numLinkedWords,
                numWordsInWrappedLines,
                numWrappedLines,
                offsetBlocks);

            currentContainedTextElements = new BitSet();
            offsetBlocks++;
            textBuilder.Length  = 0;
            tokenBuilder.Length = 0;
            tb.SetTagLevel(blockTagLevel);
            AddTextBlock(tb);
            blockTagLevel = -1;
        }
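To make the wrapped-line counters concrete, a rough worked example; that TextBlock derives a words-per-line density from these two numbers follows the boilerpipe design this port is based on, and is an assumption here:

            // Suppose the block holds 25 words whose lengths, plus one separating
            // space each, sum to roughly 170 characters. With maxLineLength = 80 the
            // running total overflows twice, so after the loop:
            //   numWrappedLines        = 2                 (completed virtual lines)
            //   numWordsCurrentLine    = 5, say            (words on the unfinished last line)
            //   numWordsInWrappedLines = 25 - 5 = 20
            // A density of 20 / 2 = 10 words per wrapped line is typical of running
            // prose; navigation and other boilerplate blocks usually score far lower.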
Example #4
        public override void Run(object[] args)
        {
            // prepare data
            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

            // Create a tokenizer.
            var tokenizer = new UnicodeTokenizer
            {
                MinTokenLen = 2,                            // Each token must be at least 2 characters long.
                Filter      = TokenizerFilter.AlphaStrict   // Tokens can consist of alphabetic characters only.
            };

            // Take the data for two classes from the CSV file.
            var data = GetLabeledTweets().Where(lt => lt.Polarity != 2).ToList();

            // Create a bag-of-words space.
            var bowSpc = new BowSpace
            {
                Tokenizer      = tokenizer,                 // Assign the tokenizer.
                StopWords      = stopWords,                 // Assign the stop words.
                Stemmer        = stemmer,                   // Assign the stemmer.
                MinWordFreq    = 1,                         // A term must appear at least n-times in the corpus for it to be part of the vocabulary.
                MaxNGramLen    = 2,                         // Terms consisting of at most n-consecutive words will be considered.
                WordWeightType = WordWeightType.TermFreq,   // Set the weighting scheme for the bag-of-words vectors to TF.
                //WordWeightType = WordWeightType.TfIdf,  // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
                NormalizeVectors  = true,                   // Normalize the bag-of-words vectors.
                CutLowWeightsPerc = 0                       // 0 disables low-weight pruning; 0.2 would remove the lowest-weighted terms summing up to 20% of each vector's weight.
            };
            ArrayList <SparseVector <double> > bowData = bowSpc.Initialize(data.Select(d => d.Text));

            // label data
            var labeledSet = new LabeledDataset <string, SparseVector <double> >();

            for (int i = 0; i < data.Count; i++)
            {
                labeledSet.Add(data[i].Label, bowData[i]);
            }
            labeledSet.Shuffle();

            int testSize    = labeledSet.Count / 10;
            var trainingSet = new LabeledDataset <string, SparseVector <double> >(labeledSet.Skip(testSize));
            var testSet     = new LabeledDataset <string, SparseVector <double> >(labeledSet.Take(testSize));

            //-------------------- SVM

            var svmBinClass = new SvmBinaryClassifier <string> {
                VerbosityLevel = SvmLightVerbosityLevel.Off
            };

            if (args.Any())
            {
                svmBinClass.C = (int)args[0];   // Override the SVM cost parameter C.
            }
            //svmBinClass.BiasedHyperplane = true;
            //svmBinClass.CustomParams = "-t 3";   // non-linear kernel
            //svmBinClass.CustomParams = String.Format("-j {0}",j);

            svmBinClass.Train(trainingSet);

            int    correct = 0;
            double avgDist = 0;

            foreach (LabeledExample <string, SparseVector <double> > labeledExample in testSet)
            {
                var prediction = svmBinClass.Predict(labeledExample.Example);
                //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
                avgDist += prediction.BestScore;
                if (prediction.BestClassLabel == labeledExample.Label)
                {
                    correct++;
                }
            }

            Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
            Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

            Result.Add("accuracy", (double)correct / testSet.Count);

            Result.Add("classifier", svmBinClass);
            Result.Add("labeled_data", labeledSet);
        }
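A hypothetical caller for the task above; the class name is a placeholder, and reading Result by key assumes the surrounding experiment framework exposes it as a dictionary after Run completes:

            var task = new TweetSentimentTask();          // placeholder name
            task.Run(new object[] { 10 });                // the optional first argument sets the SVM cost C
            Console.WriteLine(task.Result["accuracy"]);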
Example #5
        public void FlushBlock()
        {
            if (InBody == 0)
            {
                if ("TITLE".Equals(_lastStartTag, StringComparison.OrdinalIgnoreCase))
                {
                    Title = TokenBuffer.ToString().Trim();
                }
                _textBuffer.Length = 0;
                TokenBuffer.Length = 0;
                return;
            }

            int length = TokenBuffer.Length;

            switch (length)
            {
            case 0:
                return;

            case 1:
                if (SbLastWasWhitespace)
                {
                    _textBuffer.Length = 0;
                    TokenBuffer.Length = 0;
                    return;
                }
                break;
            }
            string[] tokens = UnicodeTokenizer.Tokenize(TokenBuffer);

            int       numWords            = 0;
            int       numLinkedWords      = 0;
            int       numWrappedLines     = 0;
            int       currentLineLength   = -1; // don't count the first space
            const int maxLineLength       = 80;
            int       numTokens           = 0;
            int       numWordsCurrentLine = 0;

            foreach (string token in tokens)
            {
                if (ANCHOR_TEXT_START.Equals(token))
                {
                    _inAnchorText = true;
                }
                else if (ANCHOR_TEXT_END.Equals(token))
                {
                    _inAnchorText = false;
                }
                else if (IsWord(token))
                {
                    numTokens++;
                    numWords++;
                    numWordsCurrentLine++;
                    if (_inAnchorText)
                    {
                        numLinkedWords++;
                    }
                    int tokenLength = token.Length;
                    currentLineLength += tokenLength + 1;
                    if (currentLineLength > maxLineLength)
                    {
                        numWrappedLines++;
                        currentLineLength   = tokenLength;
                        numWordsCurrentLine = 1;
                    }
                }
                else
                {
                    numTokens++;
                }
            }
            if (numTokens == 0)
            {
                return;
            }
            int numWordsInWrappedLines;

            if (numWrappedLines == 0)
            {
                numWordsInWrappedLines = numWords;
                numWrappedLines        = 1;
            }
            else
            {
                numWordsInWrappedLines = numWords - numWordsCurrentLine;
            }

            var tb = new TextBlock(
                _textBuffer.ToString().Trim(),
                _currentContainedTextElements,
                numWords,
                numLinkedWords,
                numWordsInWrappedLines,
                numWrappedLines,
                _offsetBlocks);

            _currentContainedTextElements = new BitArray(short.MaxValue);

            _offsetBlocks++;

            _textBuffer.Length = 0;
            TokenBuffer.Length = 0;

            tb.TagLevel = _blockTagLevel;
            AddTextBlock(tb);
            _blockTagLevel = -1;
        }
Example #6
        static void Main(string[] args)
        {
            // Get the stop words and stemmer for English.

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Create a tokenizer.

            UnicodeTokenizer tokenizer = new UnicodeTokenizer();

            tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
            tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

            // Load a document corpus from a file. Each line in the file
            // represents one document.

            string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Create a bag-of-words space.

            BowSpace bowSpc = new BowSpace();

            bowSpc.Tokenizer   = tokenizer;               // Assign the tokenizer.
            bowSpc.StopWords   = stopWords;               // Assign the stop words.
            bowSpc.Stemmer     = stemmer;                 // Assign the stemmer.
            bowSpc.MinWordFreq = 3;                       // A term must appear at least 3 times in the corpus for it to be part of the vocabulary.
            bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
            bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
            bowSpc.NormalizeVectors = true;               // The TF-IDF vectors will be normalized.
            bowSpc.CutLowWeightsPerc = 0.2;               // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.

            bowSpc.Initialize(docs); // Initialize the BOW space.

            // Compute 100 clusters of documents.

            KMeansFast kMeans = new KMeansFast(100); // Set k to 100.

            kMeans.Trials = 3;     // Perform 3 repetitions. Take the best result.
            kMeans.Eps    = 0.001; // Stop iterating when the partition quality increases by less than 0.001.

            ClusteringResult cr = kMeans.Cluster(bowSpc); // Execute.

            // Extract the top 5 terms with the highest TF-IDF weights
            // from each of the clusters' centroids and output the
            // number of documents (companies) in each cluster.

            foreach (Cluster cl in cr.Roots)
            {
                SparseVector <double> .ReadOnly centroid
                    = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
                Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
                Console.WriteLine(" ({0} companies)", cl.Items.Count);
            }

            // Output the documents that are contained in the first
            // cluster.

            foreach (int docIdx in cr.Roots[0].Items)
            {
                Console.WriteLine(docs[docIdx]);
            }
        }
Example #7
        static void Main(string[] args)
        {
            // Get the stop words and stemmer for English.

            IStemmer stemmer;

            Set <string> .ReadOnly stopWords;
            TextMiningUtils.GetLanguageTools(Language.English,
                                             out stopWords, out stemmer);

            // Test the stemmer.

            Console.WriteLine(stemmer.GetStem("running"));
            // Output: run

            // Create a tokenizer.

            UnicodeTokenizer tokenizer = new UnicodeTokenizer();

            tokenizer.MinTokenLen = 2;                      // Each token must be at least 2 characters long.
            tokenizer.Filter = TokenizerFilter.AlphaStrict; // Tokens can consist of alphabetic characters only.

            // Test the tokenizer.

            tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
            foreach (string token in tokenizer)
            {
                Console.Write("\"{0}\" ", token);
            }
            Console.WriteLine();
            // Output: "one" "two" "three"

            // Load a document corpus from a file. Each line in the file
            // represents one document.

            string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

            // Create a bag-of-words space.

            BowSpace bowSpc = new BowSpace();

            bowSpc.Tokenizer   = tokenizer;               // Assign the tokenizer.
            bowSpc.StopWords   = stopWords;               // Assign the stop words.
            bowSpc.Stemmer     = stemmer;                 // Assign the stemmer.
            bowSpc.MinWordFreq = 3;                       // A term must appear at least 3 times in the corpus for it to be part of the vocabulary.
            bowSpc.MaxNGramLen = 3;                       // Terms consisting of at most 3 consecutive words will be considered.
            bowSpc.WordWeightType = WordWeightType.TfIdf; // Set the weighting scheme for the bag-of-words vectors to TF-IDF.
            bowSpc.NormalizeVectors = true;               // The TF-IDF vectors will be normalized.
            bowSpc.CutLowWeightsPerc = 0.2;               // The terms with the lowest weights, summing up to 20% of the overall weight sum, will be removed from each TF-IDF vector.

            bowSpc.Initialize(docs); // Initialize the BOW space.

            // Output the vocabulary (the terms, their stems,
            // frequencies, and document frequencies) to the console.

            StreamWriter stdOut = new StreamWriter(Console.OpenStandardOutput());

            bowSpc.OutputStats(stdOut);
            stdOut.Close();

            // Output the TF-IDF vector representing the description of
            // Google to the console.

            // The description of Google can be found at row 4192 in the corpus.
            SparseVector <double> .ReadOnly googVec = bowSpc.BowVectors[4192 - 1];
            foreach (IdxDat <double> termInfo in googVec)
            {
                Console.WriteLine("{0} : {1}",
                                  bowSpc.Words[termInfo.Idx].MostFrequentForm,
                                  termInfo.Dat);
            }

            // Extract the top 5 terms with the highest TF-IDF weights
            // from the vector representing Google.

            Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
            // Output: google, relevant, targeted advertising, search,
            // index
        }