Example #1
        private SparseVectorList GetFeatureVector(Document doc, Dictionary <string, int> lexicon)
        {
            SparseVectorList featurevector = new SparseVectorList();

            int lexiconindexcount = lexicon.Count;

            var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
            var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);

            foreach (var word in words)
            {
                int value = 0;
                // Words not seen before get the next free lexicon index.
                if (lexicon.TryGetValue(word, out value) == false)
                {
                    lexicon.Add(word, lexiconindexcount);
                    value = lexiconindexcount;
                    lexiconindexcount++;
                }
                // Increase returns false when the key is not present yet; fall back to Insert.
                if (!featurevector.Increase(value, 1))
                {
                    featurevector.Insert(value, 1);
                }
            }

            featurevector.ListToArray();
            featurevector.count = featurevector.keyarray.Length;
            //featurevector.SumUpValueArray();
            if (featurevector.count < 1)
            {
                return(null);
            }
            featurevector.InvalidateList();
            featurevector.GetNorm();
            return(featurevector);
        }
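Stripped of the project-specific SparseVectorList, the example above builds a bag-of-words vector over a growable lexicon: unseen words get the next free index, and each index maps to a term count. A minimal, self-contained sketch of the same idea using plain dictionaries (class and method names here are illustrative, not part of the original code):

using System.Collections.Generic;

static class FeatureVectorSketch
{
    // Hypothetical stand-in for SparseVectorList: word index -> term frequency.
    public static Dictionary<int, int> BuildFeatureVector(
        IEnumerable<string> words, Dictionary<string, int> lexicon)
    {
        var vector = new Dictionary<int, int>();
        foreach (var word in words)
        {
            // Words not seen before get the next free lexicon index.
            if (!lexicon.TryGetValue(word, out int index))
            {
                index = lexicon.Count;
                lexicon.Add(word, index);
            }
            // Increase-or-insert: bump the count stored under this index.
            vector.TryGetValue(index, out int count);
            vector[index] = count + 1;
        }
        // Mirror the early exit in GetFeatureVector for documents with no usable words.
        return vector.Count == 0 ? null : vector;
    }
}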
Example #2
        public void Start()
        {
            var         reader = LuceneOperations.GetIndexReader(Configure.InputPath);
            var         sw     = new StreamWriter(Configure.OutputPath);
            IndexWriter writer = null;

            if (Configure.IsFilterByWordCount)
            {
                writer = LuceneOperations.GetIndexWriter(Configure.FilterWordCountIndexPath);
            }
            if (Configure.IsLoadFromFeatureVector)
            {
                Configure.TokenizeConfig.TokenizerType = TokenizerType.FeatureVector;
            }

            Console.WriteLine("Total: " + reader.NumDocs());
            int docIndex = 0;

            for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
            {
                if (iDoc % 10000 == 0)
                {
                    Console.WriteLine(iDoc);
                    sw.Flush();
                }

                string content = Configure.IsLoadFromFeatureVector ? reader.Document(iDoc).Get(BingNewsFields.FeatureVector) :
                                 LuceneOperations.GetDocumentContent(reader.Document(iDoc), Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);

                List <string> words      = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                bool          isPrintDoc = !Configure.IsFilterByWordCount || words.Count >= Configure.MinWordCount;
                if (isPrintDoc)
                {
                    if (Configure.IsFilterByWordCount)
                    {
                        writer.AddDocument(reader.Document(iDoc));
                    }

                    sw.Write(docIndex + " " + docIndex + " ");

                    foreach (var word in words)
                    {
                        sw.Write(word + " ");
                    }
                    sw.Write("\n");

                    docIndex++;
                }
            }

            if (Configure.IsFilterByWordCount)
            {
                writer.Optimize();
                writer.Close();
            }

            sw.Flush();
            sw.Close();
            reader.Close();
        }
Example #3
        private List <int> GetHashtagNumberInappropriateDocuments(IndexReader indexreader, string outputfile)
        {
            Console.WriteLine("==========Remove inappropriate hashtag number documents!==========");

            StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
            List <int>   removedDocuments = new List <int>();

            int    docNum        = indexreader.NumDocs();
            string bodyfield     = this.BodyField;
            int    removedDocNum = 0;

            Console.WriteLine("Total documents: {0}", docNum);

            var tokenConfig = new TokenizeConfig(TokenizerType.Hashtag, StopWordsFile.NO);

            for (int idoc = 0; idoc < docNum; idoc++)
            {
                // Report progress every 10,000 documents; skip the report for idoc == 0
                // (avoids dividing by zero) but still process that document.
                if (idoc % 10000 == 0 && idoc != 0)
                {
                    Console.WriteLine("Process " + idoc + "th document!");
                    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
                    if (IsPrintTextFiles)
                    {
                        sw.Flush();
                    }
                }

                Document document = indexreader.Document(idoc);

                string content = document.Get(bodyfield);
                var    words   = NLPOperations.Tokenize(content, tokenConfig);

                if (words.Count < MinHashtagNumber || words.Count > MaxHashtagNumber)
                {
                    if (IsPrintTextFiles)
                    {
                        sw.WriteLine(DocumentToString(document));
                    }
                    removedDocuments.Add(idoc);
                    removedDocNum++;
                }
            }

            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
            if (IsPrintTextFiles)
            {
                sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);

                sw.Flush();
                sw.Close();
            }

            return(removedDocuments);
        }
Example #4
        public void Parse(TokenizeConfig tokenizeConfig)
        {
            // The title is concatenated three times so it carries more weight than the body.
            var           text      = Title + " " + Title + " " + Title + " " + Body;
            var           words     = NLPOperations.Tokenize(text, tokenizeConfig);
            int           wordIndex = 0;
            Counter <int> counter   = new Counter <int>();

            foreach (var word in words)
            {
                if (!KmeansLexicon.Word2IndexDict.TryGetValue(word, out wordIndex))
                {
                    wordIndex = KmeansLexicon.Word2IndexDict.Count;
                    KmeansLexicon.Word2IndexDict.Add(word, wordIndex);
                    KmeansLexicon.Index2WordDict.Add(wordIndex, word);
                }
                counter.Add(wordIndex);
            }
            Vector     = SortUtils.EnsureSortedByKey(counter.GetCountDictionary().ToDictionary(kvp2 => kvp2.Key, kvp2 => (double)kvp2.Value));
            Norm       = Maths.GetVectorLength(Vector);
            Vector     = Maths.GetVectorMultiply(Vector, 1.0 / Norm);
            Norm       = 1;
            _orgVector = Vector;
        }
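The last four lines of Parse L2-normalize the term-frequency vector, so after the call Norm equals 1. Assuming Maths.GetVectorLength returns the Euclidean length and Maths.GetVectorMultiply scales every component, that step reduces to the following sketch (the helper class name is made up):

using System;
using System.Collections.Generic;
using System.Linq;

static class VectorNormalizationSketch
{
    // Divide every component by the vector's Euclidean length; afterwards the length is 1.
    public static Dictionary<int, double> NormalizeL2(Dictionary<int, double> vector)
    {
        double norm = Math.Sqrt(vector.Values.Sum(v => v * v));
        return norm == 0
            ? new Dictionary<int, double>(vector)
            : vector.ToDictionary(kvp => kvp.Key, kvp => kvp.Value / norm);
    }
}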
Example #5
        public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
        {
            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var docNum      = indexReader.NumDocs();

            int[] docWordCnt     = new int[docNum];
            int[] docUniqWordCnt = new int[docNum];
            Dictionary <string, int> wordDocCntDict = new Dictionary <string, int>();
            Dictionary <string, int> wordOccCntDict = new Dictionary <string, int>();

            var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
                ? BingNewsFields.FeatureVectorFieldWeights
                : BingNewsFields.NewsFieldWeights;

            ProgramProgress progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var document = indexReader.Document(iDoc);
                var content  = LuceneOperations.GetContent(document, fieldWeights);

                var words       = NLPOperations.Tokenize(content, tokenConfig);
                var uniqueWords = new HashSet <string>(words);
                docWordCnt[iDoc]     = words.Count;
                docUniqWordCnt[iDoc] = uniqueWords.Count;

                foreach (var word in uniqueWords)
                {
                    if (!wordDocCntDict.ContainsKey(word))
                    {
                        wordDocCntDict.Add(word, 0);
                    }
                    wordDocCntDict[word]++;
                }

                foreach (var word in words)
                {
                    if (!wordOccCntDict.ContainsKey(word))
                    {
                        wordOccCntDict.Add(word, 0);
                    }
                    wordOccCntDict[word]++;
                }

                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            indexReader.Close();

            //Statistics
            DoubleStatistics statDocWordCnt     = new DoubleStatistics();
            DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
            DoubleStatistics statWordDocCnt     = new DoubleStatistics();
            DoubleStatistics statWordOccCnt     = new DoubleStatistics();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                statDocWordCnt.AddNumber(docWordCnt[iDoc]);
                statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
            }

            foreach (var kvp in wordDocCntDict)
            {
                statWordDocCnt.AddNumber(kvp.Value);
            }

            foreach (var kvp in wordOccCntDict)
            {
                statWordOccCnt.AddNumber(kvp.Value);
            }


            Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
            Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
            Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
            Console.WriteLine(statWordOccCnt.ToString("statWordOccCnt"));

            //Hist
            var docWordCntHist       = new DoubleHistogram(docWordCnt.Select(i => (double)i), (double)1);
            var docUniqueWordCntList = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), (double)1);
            var wordDocCntHist       = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
            var wordDocCntHist2      = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), (double)1);

            docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
            docUniqueWordCntList.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
            wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
            wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

            Console.Read();
        }
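In the method above, wordDocCntDict is a document-frequency count (each word counted at most once per document, via the HashSet of unique words) while wordOccCntDict is a plain occurrence count. Reduced to a stand-alone sketch with hypothetical names:

using System.Collections.Generic;

static class WordStatisticsSketch
{
    // docFreq: number of documents containing the word (counted once per document);
    // collFreq: total number of occurrences across the whole collection.
    public static void Count(
        IEnumerable<IList<string>> tokenizedDocuments,
        Dictionary<string, int> docFreq,
        Dictionary<string, int> collFreq)
    {
        foreach (var words in tokenizedDocuments)
        {
            foreach (var word in new HashSet<string>(words))
                docFreq[word] = docFreq.TryGetValue(word, out int d) ? d + 1 : 1;

            foreach (var word in words)
                collFreq[word] = collFreq.TryGetValue(word, out int c) ? c + 1 : 1;
        }
    }
}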
Example #6
        public void Transform(string inputFolder, string indexPath, HashSet <string> keywords)
        {
            Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
            Console.WriteLine("InputFolder: " + inputFolder + "\n");

            string notParseSpecString = "Temp-DoNotParse";

            inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

            string[] schema = new[]
            {
                "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
                "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
            };
            var schemeDict            = Util.GetInvertedDictionary(schema);
            var textFieldIndex        = schemeDict["Text"];
            var createdTimeFieldIndex = schemeDict["CreatedAt"];
            var userIdFieldIndex      = schemeDict["UserId"];

            //string outputPath = inputFolder + notParseSpecString + "\\";
            //if (Directory.Exists(outputPath))
            //{
            //    Directory.Delete(outputPath, true);
            //}
            //Directory.CreateDirectory(outputPath);
            //var indexPath = outputPath + "Index\\";
            if (Directory.Exists(indexPath))
            {
                Directory.Delete(indexPath, true);
            }

            var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

            //Preprocess
            Console.WriteLine("Start preprocesing...");
            ProgramProgress progress   = new ProgramProgress(files.Length);
            int             estiDocCnt = 0;

            foreach (var file in files)
            {
                estiDocCnt += FileOperations.GetLineCount(file);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();
            Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

            //Parse
            Console.WriteLine("Start parsing...");

            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

            progress = new ProgramProgress(estiDocCnt);
            var sep             = new char[] { '\t' };
            int uniqDocFoundCnt = 0;
            int docFoundCnt     = 0;
            int docCnt          = 0;

            ThreeLayerHashSet <string, long, string> hash3Layer = new ThreeLayerHashSet <string, long, string>();
            int notUsedDocCnt = 0;

            foreach (var file in files)
            {
                if (file.Contains(notParseSpecString))
                {
                    continue;
                }

                if (file.EndsWith(".txt"))
                {
                    var    sr = new StreamReader(file);
                    string line;

                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(sep, StringSplitOptions.None);
                        if (tokens.Length != schema.Length)
                        {
                            notUsedDocCnt++;
                            continue;
                            //throw new ArgumentException();
                        }

                        var  words           = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                        bool isContainSearch = false;
                        foreach (var word in words)
                        {
                            if (keywords.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        if (isContainSearch)
                        {
                            string createdAt = tokens[createdTimeFieldIndex];
                            long   userId    = long.Parse(tokens[userIdFieldIndex]);
                            string text      = tokens[textFieldIndex];

                            if (!hash3Layer.Contains(createdAt, userId, text))
                            {
                                var document = new Document();
                                for (int i = 0; i < schema.Length; i++)
                                {
                                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                                }
                                indexWriter.AddDocument(document);

                                hash3Layer.Add(createdAt, userId, text);

                                uniqDocFoundCnt++;
                            }
                            docFoundCnt++;
                        }
                        docCnt++;
                        progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                                                                        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                    }

                    sr.Close();
                }
            }
            progress.PrintTotalTime();

            Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                                            uniqDocFoundCnt, docCnt, docCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docCnt, docFoundCnt == 0 ? 0 : 100 * uniqDocFoundCnt / docFoundCnt));
            Console.WriteLine("Not used doc count: " + notUsedDocCnt);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Console.WriteLine("Finish");
            Console.ReadKey();
        }
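ThreeLayerHashSet is a project helper; the deduplication it provides over (CreatedAt, UserId, Text) can be sketched with a standard HashSet over a value tuple. Names below are illustrative only:

using System.Collections.Generic;

static class DeduplicationSketch
{
    // Returns true only the first time a (createdAt, userId, text) triple is seen;
    // HashSet<T>.Add returns false for elements that are already present.
    public static bool IsFirstOccurrence(
        HashSet<(string createdAt, long userId, string text)> seen,
        string createdAt, long userId, string text)
    {
        return seen.Add((createdAt, userId, text));
    }
}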
Example #7
        public void Start()
        {
            if (!outputpath.EndsWith("\\"))
            {
                outputpath += "\\";
            }

            var tokenizerConfig = new TokenizeConfig(tokenizeConfigStr);

            var searcher    = LuceneOperations.GetIndexSearcher(inputpath);
            var max_doc_num = (int)(searchDocRatio * searcher.GetIndexReader().NumDocs());
            var scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);

            int  iter      = 0;
            bool bContinue = threshold != 0;

            while (bContinue && iter < 5)
            {
                iter++;
                Console.WriteLine("iteration------------------" + iter);
                List <string> keywordsNew;
                #region Calculate Keywords
                var counter = new Counter <string>();
                foreach (var scoredDoc in scoredDocs)
                {
                    var doc     = searcher.Doc(scoredDoc.doc);
                    var content = doc.Get(searchfield);
                    foreach (var word in NLPOperations.Tokenize(content, tokenizerConfig))
                    {
                        counter.Add(word);
                    }
                }
                keywordsNew = counter.GetMostFreqObjs(keywordNum);
                #endregion

                var scoredDocsNew = LuceneOperations.Search(searcher, searchfield, keywordsNew, max_doc_num);
                #region Test whether exit
                int repeatNum = 0;
                var docIDs    = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    docIDs.Add(scoredDoc.doc);
                }

                foreach (var scoredDocNew in scoredDocsNew)
                {
                    if (docIDs.Contains(scoredDocNew.doc))
                    {
                        repeatNum++;
                    }
                }

                bContinue = (double)repeatNum / scoredDocs.Length < threshold;
                #endregion

                Console.WriteLine(repeatNum + "  " + scoredDocsNew.Length);

                keywords   = keywordsNew;
                scoredDocs = scoredDocsNew;

                Console.WriteLine(StringOperations.GetMergedString(keywords));
            }

            max_doc_num = (int)(saveDocRatio * searcher.GetIndexReader().NumDocs());
            scoredDocs  = LuceneOperations.Search(searcher, searchfield, keywords, max_doc_num);
            var writer = LuceneOperations.GetIndexWriter(outputpath);
            foreach (var scoredDoc in scoredDocs)
            {
                Document doc = searcher.Doc(scoredDoc.doc);
                writer.AddDocument(doc);
            }
            writer.Optimize();
            writer.Close();

            if (isPrintRemovedDocuments)
            {
                var sw             = new StreamWriter(outputpath + "removeDocuments.txt");
                var selectedDocIDs = new HashSet <int>();
                foreach (var scoredDoc in scoredDocs)
                {
                    selectedDocIDs.Add(scoredDoc.doc);
                }

                var reader = searcher.GetIndexReader();
                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    if (!selectedDocIDs.Contains(iDoc))
                    {
                        sw.WriteLine(LuceneOperations.GetDocumentString(reader.Document(iDoc)));
                    }
                }
                reader.Close();
                sw.Flush();
                sw.Close();
            }

            searcher.Close();

            Console.WriteLine("Done");
            Console.ReadKey();
        }
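The loop above keeps re-deriving keywords until consecutive search results largely overlap: bContinue stays true while the fraction of previous hits that reappear is below threshold. A minimal sketch of that convergence test, with plain document IDs standing in for Lucene ScoreDocs (names are illustrative):

using System.Collections.Generic;
using System.Linq;

static class ConvergenceSketch
{
    // Fraction of the previous result set that also shows up in the new result set;
    // the refinement loop keeps iterating while this ratio stays below the threshold.
    public static double OverlapRatio(IList<int> previousDocIds, IList<int> newDocIds)
    {
        if (previousDocIds.Count == 0)
        {
            return 1.0;
        }
        var previous = new HashSet<int>(previousDocIds);
        int repeated = newDocIds.Count(previous.Contains);
        return (double)repeated / previousDocIds.Count;
    }
}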
Example #8
        public void TransformWithFileNames(string[] files, string indexPath, HashSet <string> searchHashSet, SearchSpinn3rType searchType)
        {
            double         tweetCnt       = 0;
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);
            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);

            var progress      = new ProgramProgress(files.Length);
            int docFoundCount = 0;
            int totalDocCount = 0;

            foreach (var file in files)
            {
                FileOperations.ReadJsonFile <Spinn3rTwitterData>(file, (data) =>
                {
                    tweetCnt += data.count;
                    //Console.WriteLine(data.count);
                    //Console.WriteLine(data.items[0].main);
                    foreach (var tweet in data.items)
                    {
                        if (tweet.lang != "en")
                        {
                            continue;
                        }

                        bool isContainSearch = false;
                        switch (searchType)
                        {
                        case SearchSpinn3rType.Main:
                            var words = NLPOperations.Tokenize(tweet.main, tokenizeConfig);
                            foreach (var word in words)
                            {
                                if (searchHashSet.Contains(word))
                                {
                                    isContainSearch = true;
                                    break;
                                }
                            }
                            break;

                        case SearchSpinn3rType.User:
                            isContainSearch = searchHashSet.Contains(tweet.author_link.ToLower());
                            break;

                        default:
                            throw new ArgumentException();
                        }

                        if (isContainSearch)
                        {
                            var document = new Document();
                            document.Add(new Field(TweetFields.TweetId, tweet.permalink, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Text, tweet.main, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserScreenName, tweet.author_link, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserName, tweet.author_name, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Tags, StringOperations.ConvertNullStringToEmpty(StringOperations.GetMergedString(tweet.tags)), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.CreatedAt, tweet.published, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.Location, tweet.source_location, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserDescription, tweet.source_description, Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFollowersCount, tweet.source_followers.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            document.Add(new Field(TweetFields.UserFriendsCount, tweet.source_following.ToString(), Field.Store.YES, Field.Index.ANALYZED));
                            indexWriter.AddDocument(document);
                            docFoundCount++;
                        }
                        totalDocCount++;
                    }
                });
                progress.PrintIncrementExperiment(string.Format("docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount));
            }
            progress.PrintTotalTime();

            Console.WriteLine("Final docFound: {0} out of {1} ({2}%)", docFoundCount, totalDocCount, 100 * docFoundCount / totalDocCount);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Util.ProgramFinishHalt();
        }
Example #9
        public void Start()
        {
            if (!Configure.InputPath.EndsWith("\\"))
            {
                Configure.InputPath += "\\";
            }
            var reader     = LuceneOperations.GetIndexReader(Configure.InputPath);
            var docNum     = reader.NumDocs();
            var docNumPart = Math.Max(1, docNum / 100); // guard: avoids modulo-by-zero when the index has fewer than 100 documents

            Console.WriteLine("Total: " + docNum);

            Random random = new Random(Configure.SampleSeed == -1 ? (int)DateTime.Now.Ticks : Configure.SampleSeed);

            //Topwords
            var counter = new Counter <string>();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                if (iDoc % docNumPart == 0)
                {
                    Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                }
                if (random.NextDouble() > Configure.SampleRatio)
                {
                    continue;
                }

                var doc     = reader.Document(iDoc);
                var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                var words   = NLPOperations.Tokenize(content, Configure.TokenizeConfig);
                foreach (var word in words)
                {
                    counter.Add(word);
                }
            }
            var topwords        = counter.GetMostFreqObjs(Configure.TopWordCount);
            var wordCounterDict = counter.GetCountDictionary();

            var swTopWords = new StreamWriter(Configure.InputPath + "TopWords.txt");

            foreach (var topword in topwords)
            {
                swTopWords.WriteLine(topword);
            }
            swTopWords.Flush();
            swTopWords.Close();

            //CoOccurrence
            if (Configure.IsPrintCooccurrence)
            {
                var k = topwords.Count;
                var occurCounterDict = new Dictionary <string, Counter <string> >();
                foreach (var topword in topwords)
                {
                    occurCounterDict.Add(topword, new Counter <string>());
                }
                for (int iDoc = 0; iDoc < docNum; iDoc++)
                {
                    if (iDoc % docNumPart == 0)
                    {
                        Console.WriteLine(iDoc + "\t" + (iDoc / docNumPart) + "%");
                    }
                    if (random.NextDouble() > Configure.SampleRatio)
                    {
                        continue;
                    }

                    var doc     = reader.Document(iDoc);
                    var content = LuceneOperations.GetDocumentContent(doc, Configure.FieldWeightDict, Configure.LeadingSentenceCntDict);
                    var words   = Util.GetHashSet(NLPOperations.Tokenize(content, Configure.TokenizeConfig));
                    foreach (var word in words)
                    {
                        if (occurCounterDict.ContainsKey(word))
                        {
                            var occurCounter = occurCounterDict[word];
                            foreach (var word2 in words)
                            {
                                if (word2 == word)
                                {
                                    continue;
                                }
                                if (occurCounterDict.ContainsKey(word2))
                                {
                                    occurCounter.Add(word2);
                                }
                            }
                        }
                    }
                }
                var heapSort = new HeapSortDouble(Configure.TopOccurrenceCount);
                var pairDict = new Dictionary <int, Tuple <string, string> >();
                var iPair    = 0;
                foreach (var kvp in occurCounterDict)
                {
                    var word         = kvp.Key;
                    var occurCounter = kvp.Value;
                    foreach (var kvp2 in occurCounter.GetCountDictionary())
                    {
                        heapSort.Insert(iPair, kvp2.Value);
                        pairDict.Add(iPair, new Tuple <string, string>(word, kvp2.Key));
                        iPair++;
                    }
                }

                var swCoOccurrence = new StreamWriter(Configure.InputPath + "CoOccurrence.txt");
                foreach (var kvp in heapSort.GetSortedDictionary())
                {
                    var pair = pairDict[kvp.Key];
                    swCoOccurrence.WriteLine("{0} - {1}\t{2}",
                                             pair.Item1, pair.Item2, kvp.Value);
                }

                swCoOccurrence.Flush();
                swCoOccurrence.Close();
            }

            reader.Close();
        }
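The co-occurrence block counts, for every ordered pair of top words appearing in the same document, how often the pair co-occurs, and then keeps the most frequent pairs through HeapSortDouble. A rough stand-alone equivalent with a plain dictionary and LINQ ordering (names are illustrative, and this sketch keeps all pairs in memory instead of using a bounded heap):

using System.Collections.Generic;
using System.Linq;

static class CoOccurrenceSketch
{
    // Count co-occurring ordered pairs of top words within each document's unique word set
    // and return the topK most frequent pairs.
    public static List<KeyValuePair<(string, string), int>> TopPairs(
        IEnumerable<ISet<string>> documentWordSets, ISet<string> topWords, int topK)
    {
        var pairCounts = new Dictionary<(string, string), int>();
        foreach (var words in documentWordSets)
        {
            foreach (var w1 in words)
            {
                if (!topWords.Contains(w1)) continue;
                foreach (var w2 in words)
                {
                    if (w1 == w2 || !topWords.Contains(w2)) continue;
                    pairCounts[(w1, w2)] = pairCounts.TryGetValue((w1, w2), out int c) ? c + 1 : 1;
                }
            }
        }
        return pairCounts.OrderByDescending(kvp => kvp.Value).Take(topK).ToList();
    }
}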
Example #10
        public static void Test()
        {
            string indexPath = @"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\WeiboSortByHotIndex_Time_RemoveNoise2_RemoveSimilar2";
            var    reader    = LuceneOperations.GetIndexReader(indexPath);
            //var keywords = new string[]{"街","信","死","女","清","刷","骂","愿","爱","查","舰","版","通","岁","撕"};

            //foreach (var keyword in keywords)
            {
                var sw = new StreamWriter(@"C:\Users\v-xitwan\Desktop\temp\WeiboIndex\TestTokenizer" + "Stat" + ".txt", false,
                                          Encoding.UTF8);
                //ChineseWordBreaker chineseWordBreaker = new ChineseWordBreaker(@"Utils\Lib\WordBreaker\");
                int cnt1 = 0, cnt2 = 0;
                int cnt1all = 0, cnt2all = 0;

                for (int iDoc = 0; iDoc < reader.NumDocs(); iDoc++)
                {
                    string sentence = reader.Document(iDoc).Get("NewsArticleDescription");

                    var words1 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ICTCLAS, StopWordsFile.CH));
                    var words2 = NLPOperations.Tokenize(sentence, new TokenizeConfig(TokenizerType.ChineseWordBreaker, StopWordsFile.CH));

                    //bool isPrint = false;
                    //foreach (var word in words1)
                    //    if (word.Length == 1)
                    //    {
                    //        isPrint = true;
                    //        cnt1++;
                    //    }
                    //foreach (var word in words2)
                    //    if (word.Length == 2)
                    //    {
                    //        isPrint = true;
                    //        cnt2++;
                    //    }
                    cnt1all += words1.Count;
                    cnt2all += words2.Count;

                    //if (isPrint)
                    //{
                    //    sw.WriteLine("-------------{0}-------------", iDoc);
                    //    sw.WriteLine(sentence);
                    //    sw.WriteLine("[ICT]\t" + StringOperations.GetMergedString(words1));
                    //    sw.WriteLine("[CWB]\t" + StringOperations.GetMergedString(words2));

                    //    sw.WriteLine("[ICT--]\t" + Marshal.PtrToStringAnsi(NLPIR_ParagraphProcess(sentence, 1)));
                    //    //sw.WriteLine("[CWB--]\t" + chineseWordBreaker.GetResult(sentence));
                    //    sw.WriteLine();

                    //    sw.Flush();
                    //}
                }

                sw.WriteLine("cnt1 = " + cnt1);
                sw.WriteLine("cnt2 = " + cnt2);
                sw.WriteLine("cnt1all = " + cnt1all);
                sw.WriteLine("cnt2all = " + cnt2all);

                sw.Flush();
                sw.Close();
            }
        }
Example #11
        List <int> GetLanguageErrorDocuments(IndexReader indexreader, string outputfile)
        {
            Console.WriteLine("==========Remove language error documents!==========");

            StreamWriter sw = IsPrintTextFiles ? new StreamWriter(outputfile) : null;
            List <int>   removedDocuments = new List <int>();
            var          stopWords        = IsEnglish ?
                                            FileOperations.LoadKeyWordFile(StopWordsFile.EN, true) :
                                            FileOperations.LoadKeyWordFile(StopWordsFile.CH, false);
            var stopHash = Util.GetHashSet(stopWords);

            int    docNum        = indexreader.NumDocs();
            string titlefield    = this.TitleField;
            string bodyfield     = this.BodyField;
            int    removedDocNum = 0;

            Console.WriteLine("Total documents: {0}", docNum);

            var tokenConfig = new TokenizeConfig(IsEnglish ? TokenizerType.Standard : TokenizerType.ICTCLAS, StopWordsFile.NO);
            DoubleStatistics stat_percent  = new DoubleStatistics();
            DoubleStatistics stat_absolute = new DoubleStatistics();

            for (int idoc = 0; idoc < docNum; idoc++)
            {
                // Report progress every 10,000 documents; skip the report for idoc == 0
                // (avoids dividing by zero) but still process that document.
                if (idoc % 10000 == 0 && idoc != 0)
                {
                    Console.WriteLine("Process " + idoc + "th document!");
                    Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, idoc, 100 * removedDocNum / idoc);
                    if (IsPrintTextFiles)
                    {
                        sw.Flush();
                    }
                }

                Document document = indexreader.Document(idoc);

                string content = document.Get(titlefield) + " " + document.Get(bodyfield);
                if (IsEnglish)
                {
                    content = content.ToLower();
                }
                var words    = NLPOperations.Tokenize(content, tokenConfig);
                var termCnt0 = words.Count;
                var termCnt1 = 0;
                foreach (var word in words)
                {
                    if (!stopHash.Contains(word))
                    {
                        termCnt1++;
                    }
                }

                if (((double)termCnt0 - termCnt1) / termCnt0 < MinLanguageCorrectRatio)
                {
                    if (IsPrintTextFiles)
                    {
                        sw.WriteLine(DocumentToString(document));
                    }
                    removedDocuments.Add(idoc);
                    removedDocNum++;
                }
                else
                {
                    stat_absolute.AddNumber(termCnt0 - termCnt1);
                    stat_percent.AddNumber((100.0) * (termCnt0 - termCnt1) / termCnt0);
                }
            }

            Console.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
            if (IsPrintTextFiles)
            {
                sw.WriteLine("Remove {0} out of {1}: {2}%", removedDocNum, docNum, 100 * removedDocNum / docNum);
                sw.Flush();
                sw.Close();
            }

            Console.WriteLine(stat_percent.ToString("stat_percent"));
            Console.WriteLine(stat_absolute.ToString("stat_absolute"));

            return(removedDocuments);
        }
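The filter above treats a document as a language error when the share of stopwords among its tokens falls below MinLanguageCorrectRatio, on the assumption that genuine text in the expected language contains a reasonable number of stopwords. A stand-alone sketch of that test (names are illustrative):

using System.Collections.Generic;
using System.Linq;

static class LanguageFilterSketch
{
    // Flag a document when the stopword share of its tokens falls below the expected minimum.
    public static bool IsLanguageError(IList<string> words, ISet<string> stopWords, double minStopwordRatio)
    {
        if (words.Count == 0)
        {
            return true;
        }
        int stopwordCount = words.Count(stopWords.Contains);
        return (double)stopwordCount / words.Count < minStopwordRatio;
    }
}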