Example #1
        public static void AnalyzeFieldValues(string inputPath, string fieldName, Func<string, string> convertValueFunc = null)
        {
            // Default to the identity conversion when no converter is supplied.
            if (convertValueFunc == null)
            {
                convertValueFunc = str => str;
            }

            string       fileName = StringOperations.EnsureFolderEnd(inputPath) + fieldName + ".txt";
            StreamWriter sw       = new StreamWriter(fileName);

            Counter<string> counter = new Counter<string>();
            var indexReader         = LuceneOperations.GetIndexReader(inputPath);

            // Tally each document's (converted) value of the target field.
            for (int iDoc = 0; iDoc < indexReader.NumDocs(); iDoc++)
            {
                var doc   = indexReader.Document(iDoc);
                var value = doc.Get(fieldName);
                counter.Add(convertValueFunc(value));
            }
            // Write the per-value counts, sorted by value, to both file and console.
            foreach (var kvp in counter.GetCountDictionary().OrderBy(kvp => kvp.Key))
            {
                sw.WriteLine(kvp.Key + "\t\t" + kvp.Value);
                Console.WriteLine(kvp.Key + "\t\t" + kvp.Value);
            }

            sw.WriteLine("total: " + indexReader.NumDocs());
            sw.Flush();
            sw.Close();

            indexReader.Close();
            Console.ReadKey();
        }
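
A minimal call sketch for AnalyzeFieldValues; the index path, field name, and converter below are hypothetical placeholders, not values from the original source:

        // Hypothetical usage: count documents per date, assuming the index stores a
        // "DiscoveryStringTime" field whose value begins with a date token.
        AnalyzeFieldValues(@"D:\Data\Index\", "DiscoveryStringTime",
            str => str == null ? "(null)" : str.Split(' ')[0]);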
Example #2
        public void Start()
        {
            string path = @"D:\Project\StreamingRoseRiver\Obama\Tree1-5000-1e-100-LastYear\";

            HashSet<int> removeIndices = Util.GetHashSet(Util.GetIntArray(0, 70));
            HashSet<int> selectIndices = Util.GetHashSet(Util.GetIntArray(71, 124)); // computed but not used below
            int          subtractCount = removeIndices.Count;

            //foreach (var subPath in Directory.GetDirectories(path))
            {
                foreach (var fullFileName in Directory.GetFiles(path)) //Directory.GetFiles(subPath))
                {
                    var fileName   = StringOperations.GetFileName(fullFileName);
                    var fileFolder = StringOperations.GetFolder(fullFileName);

                    int index = GetIndexInFile(fileName);
                    if (index == -1 || removeIndices.Contains(index))
                    {
                        // Unrecognized or explicitly removed index: delete the file.
                        File.Delete(fullFileName);
                    }
                    else
                    {
                        // Shift the kept file's index down; copy + delete is used so
                        // an existing target file is overwritten.
                        File.Copy(fullFileName, StringOperations.EnsureFolderEnd(fileFolder) + ReplaceFileNameIndex(fileName, index - subtractCount), true);
                        File.Delete(fullFileName);
                    }
                }
            }
        }
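
GetIndexInFile and ReplaceFileNameIndex are not shown in this example. A plausible sketch under the assumption that file names end with a numeric index such as "tree_71.gv" (an illustration, not the original implementation):

        // Hypothetical helper: extract the trailing numeric index, or -1 if absent.
        private static int GetIndexInFile(string fileName)
        {
            var match = System.Text.RegularExpressions.Regex.Match(fileName, @"(\d+)\.[^.]+$");
            return match.Success ? int.Parse(match.Groups[1].Value) : -1;
        }

        // Hypothetical helper: replace the numeric index, keeping the extension.
        private static string ReplaceFileNameIndex(string fileName, int newIndex)
        {
            return System.Text.RegularExpressions.Regex.Replace(fileName, @"\d+(\.[^.]+)$", newIndex + "$1");
        }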
Example #3
 private IndexWriter GetWriter(Document doc)
 {
     if (!Configure.IsSplitByTime)
     {
         return _writers.Values.First();
     }
     else
     {
         var dateTime    = StringOperations.ParseDateTimeString(doc.Get(Configure.TimeField), Configure.ParseTimeFormat);
         string projDate = _dateTransferFunc(dateTime.ToString(_dateFormatString));
         IndexWriter writer;
         // Lazily create and cache one IndexWriter per projected date.
         if (!_writers.TryGetValue(projDate, out writer))
         {
             string path = StringOperations.EnsureFolderEnd(Configure.OutputPath) + projDate;
             writer             = LuceneOperations.GetIndexWriter(path);
             _writers[projDate] = writer;
         }
         return writer;
     }
 }
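
A hedged sketch of how GetWriter is typically driven; the helper method below is an assumption about the surrounding class, not code from the original:

 // Hypothetical caller: route each document to its per-period index writer.
 private void AddDocuments(IEnumerable<Document> docs)
 {
     foreach (var doc in docs)
     {
         GetWriter(doc).AddDocument(doc);
     }
 }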
Example #4
        public static void AnalyzeTwitterWordDistribution(string inputPath, TokenizeConfig tokenConfig)
        {
            var indexReader = LuceneOperations.GetIndexReader(inputPath);
            var docNum      = indexReader.NumDocs();

            int[] docWordCnt     = new int[docNum];
            int[] docUniqWordCnt = new int[docNum];
            Dictionary<string, int> wordDocCntDict = new Dictionary<string, int>(); // word -> #docs containing it
            Dictionary<string, int> wordOccCntDict = new Dictionary<string, int>(); // word -> total occurrence count

            var fieldWeights = tokenConfig.TokenizerType == TokenizerType.FeatureVector
                ? BingNewsFields.FeatureVectorFieldWeights
                : BingNewsFields.NewsFieldWeights;

            ProgramProgress progress = new ProgramProgress(docNum);

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                var document = indexReader.Document(iDoc);
                var content  = LuceneOperations.GetContent(document, fieldWeights);

                var words       = NLPOperations.Tokenize(content, tokenConfig);
                var uniqueWords = new HashSet<string>(words);
                docWordCnt[iDoc]     = words.Count;
                docUniqWordCnt[iDoc] = uniqueWords.Count;

                // Document frequency: each unique word counts once per document.
                foreach (var word in uniqueWords)
                {
                    if (!wordDocCntDict.ContainsKey(word))
                    {
                        wordDocCntDict.Add(word, 0);
                    }
                    wordDocCntDict[word]++;
                }

                // Collection frequency: every occurrence counts.
                foreach (var word in words)
                {
                    if (!wordOccCntDict.ContainsKey(word))
                    {
                        wordOccCntDict.Add(word, 0);
                    }
                    wordOccCntDict[word]++;
                }

                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();

            indexReader.Close();

            //Statistics
            DoubleStatistics statDocWordCnt     = new DoubleStatistics();
            DoubleStatistics statDocUniqWordCnt = new DoubleStatistics();
            DoubleStatistics statWordDocCnt     = new DoubleStatistics();
            DoubleStatistics statWordOccCnt     = new DoubleStatistics();

            for (int iDoc = 0; iDoc < docNum; iDoc++)
            {
                statDocWordCnt.AddNumber(docWordCnt[iDoc]);
                statDocUniqWordCnt.AddNumber(docUniqWordCnt[iDoc]);
            }

            foreach (var kvp in wordDocCntDict)
            {
                statWordDocCnt.AddNumber(kvp.Value);
            }

            foreach (var kvp in wordOccCntDict)
            {
                statWordOccCnt.AddNumber(kvp.Value);
            }


            Console.WriteLine(statDocWordCnt.ToString("statDocWordCnt"));
            Console.WriteLine(statDocUniqWordCnt.ToString("statDocUniqWordCnt"));
            Console.WriteLine(statWordDocCnt.ToString("statWordDocCnt"));
            Console.WriteLine(statWordOccCnt.ToString("wordOccCnt"));

            //Hist
            var docWordCntHist       = new DoubleHistogram(docWordCnt.Select(i => (double)i), 1.0);
            var docUniqueWordCntHist = new DoubleHistogram(docUniqWordCnt.Select(i => (double)i), 1.0);
            var wordDocCntHist       = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1000);
            var wordDocCntHist2      = new DoubleHistogram(wordDocCntDict.Select(kvp => (double)kvp.Value), 1.0);

            docWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docWordCntHist.csv");
            docUniqueWordCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "docUniqueWordCntList.csv");
            wordDocCntHist.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist.csv");
            wordDocCntHist2.PrintToFile(StringOperations.EnsureFolderEnd(inputPath) + "wordDocCntHist2.csv");

            Console.Read();
        }
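
A minimal driver sketch for the analysis above; the index path is a hypothetical placeholder:

        // Hypothetical usage: profile the word distribution of an existing index
        // with the Twitter tokenizer.
        AnalyzeTwitterWordDistribution(@"D:\Data\TwitterIndex\",
            new TokenizeConfig(TokenizerType.Twitter));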
        public void Transform(string inputFolder, string indexPath, HashSet<string> keywords)
        {
            Console.WriteLine("Start to search words: " + StringOperations.GetMergedString(keywords));
            Console.WriteLine("InputFolder: " + inputFolder + "\n");

            string notParseSpecString = "Temp-DoNotParse";

            inputFolder = StringOperations.EnsureFolderEnd(inputFolder);

            string[] schema = new[]
            {
                "CreatedAt", "Text", "IsRetweet", "Retweeted", "RetweetCount",
                "UserScreenName", "UserId", "UserFollowersCount", "UserFriendsCount"
            };
            var schemeDict            = Util.GetInvertedDictionary(schema);
            var textFieldIndex        = schemeDict["Text"];
            var createdTimeFieldIndex = schemeDict["CreatedAt"];
            var userIdFieldIndex      = schemeDict["UserId"];

            //string outputPath = inputFolder + notParseSpecString + "\\";
            //if (Directory.Exists(outputPath))
            //{
            //    Directory.Delete(outputPath, true);
            //}
            //Directory.CreateDirectory(outputPath);
            //var indexPath = outputPath + "Index\\";
            // Rebuild the output index from scratch.
            if (Directory.Exists(indexPath))
            {
                Directory.Delete(indexPath, true);
            }

            var files = Directory.GetFiles(inputFolder, "*.*", SearchOption.AllDirectories);

            //Preprocess
            Console.WriteLine("Start preprocesing...");
            ProgramProgress progress   = new ProgramProgress(files.Length);
            int             estiDocCnt = 0;

            foreach (var file in files)
            {
                estiDocCnt += FileOperations.GetLineCount(file);
                progress.PrintIncrementExperiment();
            }
            progress.PrintTotalTime();
            Console.WriteLine("Estimate tweet count: " + estiDocCnt + "\n");

            //Parse
            Console.WriteLine("Start parsing...");

            var            indexWriter    = LuceneOperations.GetIndexWriter(indexPath);
            TokenizeConfig tokenizeConfig = new TokenizeConfig(TokenizerType.Twitter);

            progress = new ProgramProgress(estiDocCnt);
            var sep             = new char[] { '\t' };
            int uniqDocFoundCnt = 0;
            int docFoundCnt     = 0;
            int docCnt          = 0;

            // Deduplicate tweets by the (createdAt, userId, text) triple.
            ThreeLayerHashSet<string, long, string> hash3Layer = new ThreeLayerHashSet<string, long, string>();
            int notUsedDocCnt = 0;

            foreach (var file in files)
            {
                if (file.Contains(notParseSpecString))
                {
                    continue;
                }

                if (file.EndsWith(".txt"))
                {
                    var    sr = new StreamReader(file);
                    string line;

                    while ((line = sr.ReadLine()) != null)
                    {
                        var tokens = line.Split(sep, StringSplitOptions.None);
                        if (tokens.Length != schema.Length)
                        {
                            notUsedDocCnt++;
                            continue;
                            //throw new ArgumentException();
                        }

                        var words = NLPOperations.Tokenize(tokens[textFieldIndex], tokenizeConfig);
                        // Keep the tweet only if it contains at least one search keyword.
                        bool isContainSearch = false;
                        foreach (var word in words)
                        {
                            if (keywords.Contains(word))
                            {
                                isContainSearch = true;
                                break;
                            }
                        }
                        if (isContainSearch)
                        {
                            string createdAt = tokens[createdTimeFieldIndex];
                            long   userId    = long.Parse(tokens[userIdFieldIndex]);
                            string text      = tokens[textFieldIndex];

                            // Index only the first copy of each (createdAt, userId, text) triple.
                            if (!hash3Layer.Contains(createdAt, userId, text))
                            {
                                var document = new Document();
                                for (int i = 0; i < schema.Length; i++)
                                {
                                    document.Add(new Field(schema[i], tokens[i], Field.Store.YES, Field.Index.ANALYZED));
                                }
                                indexWriter.AddDocument(document);

                                hash3Layer.Add(createdAt, userId, text);

                                uniqDocFoundCnt++;
                            }
                            docFoundCnt++;
                        }
                        docCnt++;
                        progress.PrintIncrementExperiment(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUniqueRatio: {3}%",
                                                                        uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, (docFoundCnt == 0 ? 0 : (100 * uniqDocFoundCnt / docFoundCnt))));
                    }

                    sr.Close();
                }
            }
            progress.PrintTotalTime();

            Console.WriteLine(string.Format("uniqDocFound: {0} out of {1} ({2}%), docFoundUnqiueRatio: {3}%",
                                            uniqDocFoundCnt, docCnt, 100 * uniqDocFoundCnt / docCnt, 100 * uniqDocFoundCnt / docFoundCnt));
            Console.WriteLine("Not used doc count: " + notUsedDocCnt);

            Console.WriteLine("Start writing index...");
            indexWriter.Commit();
            indexWriter.Close();

            Console.WriteLine("Finish");
            Console.ReadKey();
        }
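
A hedged driver sketch for Transform; the containing class name, folders, and keyword set are hypothetical placeholders:

        // Hypothetical usage: index only tweets that mention at least one keyword.
        var keywords = new HashSet<string> { "obama", "healthcare" };
        new TweetIndexTransform().Transform(@"D:\Data\RawTweets\", @"D:\Data\TweetIndex\", keywords);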