public IncrementalBowSpace()
{
    // Configure the tokenizer inherited from the base class:
    // tokens must be at least two characters long, filtered with
    // the loose alphanumeric rule.
    var tokenizer = (UnicodeTokenizer)mTokenizer;
    tokenizer.MinTokenLen = 2;
    tokenizer.Filter = TokenizerFilter.AlphanumLoose;
}
/// <summary>
/// Interactive read loop: parses each console line into a term query
/// against the named collection and prints up to ten of the returned
/// documents. Blank input, "q" or "quit" exits the loop.
/// </summary>
private static async Task Query(string dir, string collectionName)
{
    var tokenizer = new UnicodeTokenizer();
    var parser = new TermQueryParser();
    var config = new IniConfiguration(Path.Combine(Directory.GetCurrentDirectory(), "sir.ini"));
    var sessionFactory = new SessionFactory(dir, tokenizer, config);

    while (true)
    {
        Console.Write("query>");
        var line = Console.ReadLine();

        // Exit on blank input or an explicit quit command.
        if (string.IsNullOrWhiteSpace(line) || line == "q" || line == "quit")
        {
            break;
        }

        var query = parser.Parse(collectionName.ToHash(), line, tokenizer);
        query.Skip = 0;
        query.Take = 100;

        using (var session = sessionFactory.CreateReadSession(collectionName, collectionName.ToHash()))
        {
            var result = await session.Read(query);
            var docs = result.Docs;

            if (docs.Count > 0)
            {
                var rank = 0;
                foreach (var doc in docs.Take(10))
                {
                    Console.WriteLine("{0} {1} {2}", rank++, doc["___score"], doc["title"]);
                }
            }
        }
    }
}
/// <summary>
/// Flushes the current token buffer. Outside the body it only captures
/// the &lt;title&gt; text; inside the body it computes word / link / line-wrap
/// statistics over the buffered tokens and emits them as a new TextBlock.
/// </summary>
public void FlushBlock()
{
    // Outside <body> the only interesting text is the <title> element.
    if (inBody == 0)
    {
        // FIX: the original repeated the `inBody == 0` test here, which is
        // always true inside this branch — the duplicate check is removed.
        if (Sharpen.Runtime.EqualsIgnoreCase("TITLE", lastStartTag))
        {
            SetTitle(tokenBuilder.ToString().Trim());
        }
        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }
    int length = tokenBuilder.Length;
    if (length == 0)
    {
        return;
    }
    if (length == 1 && sbLastWasWhitespace)
    {
        // A lone whitespace character — nothing worth emitting.
        textBuilder.Length = 0;
        tokenBuilder.Length = 0;
        return;
    }
    string[] tokens = UnicodeTokenizer.Tokenize(tokenBuilder);
    int numWords = 0;
    int numLinkedWords = 0;
    int numWrappedLines = 0;
    int currentLineLength = -1; // don't count the first space
    const int maxLineLength = 80;
    int numTokens = 0;
    int numWordsCurrentLine = 0;
    foreach (string token in tokens)
    {
        if (token == ANCHOR_TEXT_START)
        {
            inAnchorText = true;
        }
        else if (token == ANCHOR_TEXT_END)
        {
            inAnchorText = false;
        }
        else if (IsWord(token))
        {
            numTokens++;
            numWords++;
            numWordsCurrentLine++;
            if (inAnchorText)
            {
                numLinkedWords++;
            }
            int tokenLength = token.Length;
            currentLineLength += tokenLength + 1;
            if (currentLineLength > maxLineLength)
            {
                // Simulate wrapping at 80 columns to estimate text density.
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
            }
        }
        else
        {
            numTokens++;
        }
    }
    if (numTokens == 0)
    {
        return;
    }
    int numWordsInWrappedLines;
    if (numWrappedLines == 0)
    {
        numWordsInWrappedLines = numWords;
        numWrappedLines = 1;
    }
    else
    {
        numWordsInWrappedLines = numWords - numWordsCurrentLine;
    }
    TextBlock tb = new TextBlock(textBuilder.ToString().Trim(), currentContainedTextElements,
        numWords, numLinkedWords, numWordsInWrappedLines, numWrappedLines, offsetBlocks);
    currentContainedTextElements = new BitSet();
    offsetBlocks++;
    textBuilder.Length = 0;
    tokenBuilder.Length = 0;
    tb.SetTagLevel(blockTagLevel);
    AddTextBlock(tb);
    blockTagLevel = -1;
}
/// <summary>
/// Trains and evaluates a binary SVM sentiment classifier over labeled
/// tweets (neutral class excluded). Optional args[0] overrides the SVM
/// C parameter. Accuracy, the classifier and the labeled data are
/// published through <c>Result</c>.
/// </summary>
public override void Run(object[] args)
{
    // Prepare the language tools for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer: alphabetic-only tokens, at least 2 characters long.
    var tokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,
        Filter = TokenizerFilter.AlphaStrict
    };

    // Take data for the two polarity classes from the CSV file
    // (polarity 2 = neutral, which is skipped).
    // FIX: the original wrapped the query in `new List<LabeledTweet>(...)`
    // and then called `.ToList()` on it, materializing the list twice.
    var data = GetLabeledTweets().Where(lt => lt.Polarity != 2).ToList();

    // Create a bag-of-words space.
    var bowSpc = new BowSpace
    {
        Tokenizer = tokenizer,
        StopWords = stopWords,
        Stemmer = stemmer,
        MinWordFreq = 1,          // A term must appear at least once to enter the vocabulary.
        MaxNGramLen = 2,          // Consider terms of at most 2 consecutive words.
        WordWeightType = WordWeightType.TermFreq, // TF weighting (use TfIdf for TF-IDF).
        NormalizeVectors = true,  // Normalize the resulting vectors.
        CutLowWeightsPerc = 0     // 0 = keep all terms (no low-weight cutting).
    };
    ArrayList<SparseVector<double>> bowData = bowSpc.Initialize(data.Select(d => d.Text));

    // Label the data.
    var labeledSet = new LabeledDataset<string, SparseVector<double>>();
    for (int i = 0; i < data.Count; i++)
    {
        labeledSet.Add(data[i].Label, bowData[i]);
    }
    labeledSet.Shuffle();

    // Hold out 10% of the examples for testing.
    int testSize = labeledSet.Count / 10;
    var trainingSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Skip(testSize));
    var testSet = new LabeledDataset<string, SparseVector<double>>(labeledSet.Take(testSize));

    //-------------------- SVM

    var svmBinClass = new SvmBinaryClassifier<string> { VerbosityLevel = SvmLightVerbosityLevel.Off };
    if (args.Any())
    {
        svmBinClass.C = (int)args[0];
    }
    //svmBinClass.BiasedHyperplane = true;
    //svmBinClass.CustomParams = "-t 3"; // non-linear kernel
    //svmBinClass.CustomParams = String.Format("-j {0}",j);

    svmBinClass.Train(trainingSet);

    // Evaluate on the held-out set.
    int correct = 0;
    double avgDist = 0;
    foreach (LabeledExample<string, SparseVector<double>> labeledExample in testSet)
    {
        var prediction = svmBinClass.Predict(labeledExample.Example);
        //Output.WriteLine("actual: {0}\tpredicted: {1}\t score: {2:0.0000}", labeledExample.Label, prediction.BestClassLabel, prediction.BestScore);
        avgDist += prediction.BestScore;
        if (prediction.BestClassLabel == labeledExample.Label)
        {
            correct++;
        }
    }

    Output.WriteLine("Accuracy: {0:0.00}", 100.0 * correct / testSet.Count);
    Output.WriteLine("Avg. distance: {0:0.00}", avgDist / testSet.Count);

    Result.Add("accuracy", (double)correct / testSet.Count);
    Result.Add("classifier", svmBinClass);
    Result.Add("labeled_data", labeledSet);
}
/// <summary>
/// Flushes the current token buffer. Outside the body it only captures
/// the &lt;title&gt; text; inside the body it computes word / link / line-wrap
/// statistics over the buffered tokens and emits a new <see cref="TextBlock"/>.
/// </summary>
public void FlushBlock()
{
    // Outside <body> the only interesting text is the <title> element.
    if (InBody == 0)
    {
        // FIX: the original condition also re-tested `InBody == 0`,
        // which is always true inside this branch — removed.
        if ("TITLE".Equals(_lastStartTag, StringComparison.CurrentCultureIgnoreCase))
        {
            Title = TokenBuffer.ToString().Trim();
        }
        _textBuffer.Length = 0;
        TokenBuffer.Length = 0;
        return;
    }

    int length = TokenBuffer.Length;
    switch (length)
    {
        case 0:
            return;
        case 1:
            if (SbLastWasWhitespace)
            {
                // A lone whitespace character — nothing worth emitting.
                _textBuffer.Length = 0;
                TokenBuffer.Length = 0;
                return;
            }
            break;
    }

    string[] tokens = UnicodeTokenizer.Tokenize(TokenBuffer);
    int numWords = 0;
    int numLinkedWords = 0;
    int numWrappedLines = 0;
    int currentLineLength = -1; // don't count the first space
    const int maxLineLength = 80;
    int numTokens = 0;
    int numWordsCurrentLine = 0;

    foreach (string token in tokens)
    {
        if (ANCHOR_TEXT_START.Equals(token))
        {
            _inAnchorText = true;
        }
        else if (ANCHOR_TEXT_END.Equals(token))
        {
            _inAnchorText = false;
        }
        else if (IsWord(token))
        {
            numTokens++;
            numWords++;
            numWordsCurrentLine++;
            if (_inAnchorText)
            {
                numLinkedWords++;
            }
            int tokenLength = token.Length;
            currentLineLength += tokenLength + 1;
            if (currentLineLength > maxLineLength)
            {
                // Simulate wrapping at 80 columns to estimate text density.
                numWrappedLines++;
                currentLineLength = tokenLength;
                numWordsCurrentLine = 1;
            }
        }
        else
        {
            numTokens++;
        }
    }

    if (numTokens == 0)
    {
        return;
    }

    int numWordsInWrappedLines;
    if (numWrappedLines == 0)
    {
        numWordsInWrappedLines = numWords;
        numWrappedLines = 1;
    }
    else
    {
        numWordsInWrappedLines = numWords - numWordsCurrentLine;
    }

    var tb = new TextBlock(
        _textBuffer.ToString().Trim(),
        _currentContainedTextElements,
        numWords,
        numLinkedWords,
        numWordsInWrappedLines,
        numWrappedLines,
        _offsetBlocks);

    _currentContainedTextElements = new BitArray(short.MaxValue);
    _offsetBlocks++;
    _textBuffer.Length = 0;
    TokenBuffer.Length = 0;
    tb.TagLevel = _blockTagLevel;
    AddTextBlock(tb);
    _blockTagLevel = -1;
}
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Create a tokenizer: alphabetic-only tokens, at least 2 characters long.
    var tokenizer = new UnicodeTokenizer
    {
        MinTokenLen = 2,
        Filter = TokenizerFilter.AlphaStrict
    };

    // Load a document corpus from a file; each line is one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create and configure the bag-of-words space.
    var bowSpc = new BowSpace
    {
        Tokenizer = tokenizer,
        StopWords = stopWords,
        Stemmer = stemmer,
        MinWordFreq = 3,                       // a term must appear at least 3 times to enter the vocabulary
        MaxNGramLen = 3,                       // consider n-grams of up to 3 consecutive words
        WordWeightType = WordWeightType.TfIdf, // TF-IDF weighting
        NormalizeVectors = true,               // normalize the TF-IDF vectors
        CutLowWeightsPerc = 0.2                // drop terms summing to the lowest 20% of weight
    };
    bowSpc.Initialize(docs);

    // Compute 100 clusters of documents: best of 3 trials, stopping once
    // the partition quality improves by less than 0.001.
    var kMeans = new KMeansFast(100)
    {
        Trials = 3,
        Eps = 0.001
    };
    ClusteringResult cr = kMeans.Cluster(bowSpc);

    // For each cluster, print the top-5 centroid terms and the number of
    // documents (companies) it contains.
    foreach (Cluster cl in cr.Roots)
    {
        SparseVector<double>.ReadOnly centroid = cl.ComputeCentroid(bowSpc, CentroidType.NrmL2);
        Console.Write(bowSpc.GetKeywordsStr(centroid, 5));
        Console.WriteLine(" ({0} companies)", cl.Items.Count);
    }

    // Print the documents contained in the first cluster.
    foreach (int docIdx in cr.Roots[0].Items)
    {
        Console.WriteLine(docs[docIdx]);
    }
}
static void Main(string[] args)
{
    // Get the stop words and stemmer for English.
    IStemmer stemmer;
    Set<string>.ReadOnly stopWords;
    TextMiningUtils.GetLanguageTools(Language.English, out stopWords, out stemmer);

    // Test the stemmer.
    Console.WriteLine(stemmer.GetStem("running")); // Output: run

    // Create a tokenizer: alphabetic-only tokens, at least 2 characters long.
    UnicodeTokenizer tokenizer = new UnicodeTokenizer();
    tokenizer.MinTokenLen = 2;
    tokenizer.Filter = TokenizerFilter.AlphaStrict;

    // Test the tokenizer.
    tokenizer.Text = "one 1 two 2 three 3 one_1 two_2 three_3";
    foreach (string token in tokenizer)
    {
        Console.Write("\"{0}\" ", token);
    }
    Console.WriteLine(); // Output: "one" "two" "three"

    // Load a document corpus from a file. Each line represents one document.
    string[] docs = File.ReadAllLines("..\\..\\Data\\YahooFinance.txt");

    // Create and configure the bag-of-words space.
    BowSpace bowSpc = new BowSpace();
    bowSpc.Tokenizer = tokenizer;                 // Assign the tokenizer.
    bowSpc.StopWords = stopWords;                 // Assign the stop words.
    bowSpc.Stemmer = stemmer;                     // Assign the stemmer.
    bowSpc.MinWordFreq = 3;                       // A term must appear at least 3 times to enter the vocabulary.
    bowSpc.MaxNGramLen = 3;                       // Consider terms of at most 3 consecutive words.
    bowSpc.WordWeightType = WordWeightType.TfIdf; // TF-IDF weighting.
    bowSpc.NormalizeVectors = true;               // Normalize the TF-IDF vectors.
    bowSpc.CutLowWeightsPerc = 0.2;               // Drop terms summing to the lowest 20% of weight.
    bowSpc.Initialize(docs);                      // Initialize the BOW space.

    // Output the vocabulary (terms, stems, frequencies, and document
    // frequencies) to the console.
    // FIX: the original called Close() with no try/finally, leaking the
    // writer if OutputStats threw; `using` guarantees flush-and-close.
    using (StreamWriter stdOut = new StreamWriter(Console.OpenStandardOutput()))
    {
        bowSpc.OutputStats(stdOut);
    }

    // Output the TF-IDF vector representing the description of Google.
    SparseVector<double>.ReadOnly googVec = bowSpc.BowVectors[4192 - 1]; // Google is at row 4192 in the corpus.
    foreach (IdxDat<double> termInfo in googVec)
    {
        Console.WriteLine("{0} : {1}", bowSpc.Words[termInfo.Idx].MostFrequentForm, termInfo.Dat);
    }

    // Extract the top 5 terms with the highest TF-IDF weights from the
    // vector representing Google.
    Console.WriteLine(bowSpc.GetKeywordsStr(googVec, 5));
    // Output: google, relevant, targeted advertising, search, index
}