コード例 #1
0
        /// <summary>
        /// Searches the on-disk key word index for the articles matching the words of a search phrase.
        /// </summary>
        /// <remarks>
        /// The phrase is lower-cased and split on spaces; duplicate words and words shorter than three
        /// characters are ignored. Each remaining word is looked up in the index file selected by its
        /// first three characters, or in the <c>REST</c> index when <c>ArticlesHubDictionary</c> has no
        /// entry for that prefix. Words whose index file is missing, or which are absent from the index,
        /// are skipped. The first word that is found seeds the result with its articles that have a
        /// non-blank title; every further word that is found narrows the result to the articles sharing
        /// a JSON file name with it.
        /// </remarks>
        /// <param name="phraseToSearch">Space-separated key words to look for.</param>
        /// <returns>
        /// The articles common to all key words that were found in the index; an empty
        /// <see cref="Articles"/> when the phrase is null or blank, or when no key word was found.
        /// </returns>
        public Articles SearchArticles(string phraseToSearch)
        {
            var commonArticles = new Articles();

            if (String.IsNullOrWhiteSpace(phraseToSearch))
            {
                // Nothing to search for. A blank phrase yields no key words anyway; a null phrase
                // is treated the same way instead of failing on the ToLower() call below.
                return commonArticles;
            }

            var smallKeyWords = phraseToSearch
                                .ToLower()
                                .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                                .Distinct()
                                .ToList();

            bool haveCommonArticlesBeenSet = false;

            foreach (var keyWord in smallKeyWords)
            {
                if (keyWord.Length < 3)
                {
                    // Short words are not of interest.
                    continue;
                }

                // The key word is already lower-case, so its prefix can be used directly.
                var keyWordIndex = keyWord.Substring(0, 3);
                if (!ArticlesHubDictionary.ContainsKey(keyWordIndex))
                {
                    // Prefixes without their own hub share the REST index.
                    keyWordIndex = REST;
                }

                var indexPath = Path.Combine(_outputDirectory, "index", $"index_{keyWordIndex}.bin.zip");
                if (!File.Exists(indexPath))
                {
                    continue;
                }

                // Load the index file for this prefix only once it is known to exist.
                var articlesHub = new ArticlesHub();
                articlesHub.Deserialize(indexPath);

                // Single lookup instead of Keys.Contains followed by the indexer.
                if (!articlesHub.KeyWordsDictionary.Dictionary.TryGetValue(keyWord, out var articlesFound))
                {
                    continue;
                }

                if (!haveCommonArticlesBeenSet)
                {
                    // First key word found: seed the result, dropping articles without a title.
                    var nonEmptyTitlesList = articlesFound.List
                                             .Where(x => !String.IsNullOrWhiteSpace(Zipper.Unzip(x.ZippedTitle.ToByteArray())));
                    commonArticles.List.AddRange(nonEmptyTitlesList);
                    haveCommonArticlesBeenSet = true;
                    continue;
                }

                // Subsequent key words: keep only the articles whose JSON file name is already
                // part of the current result.
                var commonFileNames = commonArticles
                                      .List.Select(x => Zipper.Unzip(x.ZippedJsonFileName.ToByteArray()))
                                      .ToHashSet();

                var narrowedArticles = new Articles();
                foreach (var articleFound in articlesFound.List)
                {
                    var articleFoundFileName = Zipper.Unzip(articleFound.ZippedJsonFileName.ToByteArray());
                    if (commonFileNames.Contains(articleFoundFileName))
                    {
                        narrowedArticles.List.Add(articleFound);
                    }
                }

                commonArticles = narrowedArticles;
            }

            return commonArticles;
        }
コード例 #2
0
        /// <summary>
        /// Registers a JSON article under each of its key words and buffers its content in the
        /// dictionary of articles, which is serialized once it holds <paramref name="resetCounter"/> entries.
        /// </summary>
        /// <remarks>
        /// Articles with neither an abstract nor a title are skipped. Key words are the lower-cased,
        /// space-separated words of the abstract followed by those of the title, without repetitions;
        /// words shorter than three characters are ignored. Each key word is added to the hub selected
        /// by its first three characters, or to the <c>REST</c> hub when no hub exists for that prefix.
        /// </remarks>
        /// <param name="convertedArticlesIndex">
        /// Index used in the name of the packed file currently being filled; incremented each time the
        /// dictionary is serialized.
        /// </param>
        /// <param name="resetCounter">Sets maximum amount of packed json articles per bin.zip file.</param>
        /// <param name="articleFileInfo">File info of JSON article.</param>
        /// <param name="articlesOutputDir">Directory the packed articles file is written to.</param>
        /// <param name="filesCount">How many JSON files are analysed in total (used for the progress message only).</param>
        /// <param name="dictionaryOfJsonArticles">Temporary dictionary of articles (file name to JSON content) periodically saved.</param>
        private void AddArticle(
            ref int convertedArticlesIndex,
            int resetCounter,
            FileInfo articleFileInfo,
            string articlesOutputDir,
            int filesCount,
            Dictionary <String, String> dictionaryOfJsonArticles)
        {
            var baseOutputFileName = "converted_articles_" + convertedArticlesIndex + ".bin";
            var articleJsonContent = File.ReadAllText(articleFileInfo.FullName);
            var articleAbstract    = JsonArticleToDocument.GetArticleAbstract(articleJsonContent);
            var articleTitle       = JsonArticleToDocument.GetArticleTitle(articleJsonContent);

            if (String.IsNullOrWhiteSpace(articleAbstract) && String.IsNullOrWhiteSpace(articleTitle))
            {
                // Nothing to index the article by.
                return;
            }

            var article = new Article()
            {
                ZippedJsonFileName   = ByteString.CopyFrom(Zipper.Zip(articleFileInfo.Name)),
                ZippedBucketFileName = ByteString.CopyFrom(Zipper.Zip(baseOutputFileName + ".zip")),
                // The title may be missing when only the abstract is present.
                // NOTE(review): assumes Zipper.Zip does not accept null - confirm.
                ZippedTitle          = ByteString.CopyFrom(Zipper.Zip(articleTitle ?? String.Empty)),
                CounterIndex         = convertedArticlesIndex
            };

            // Abstract words take precedence and are followed by title words. Distinct() keeps the
            // first occurrence, so the order matches tokenising both texts separately and merging.
            var separators = new[] { ' ' };
            var keyWords   = new[] { articleAbstract, articleTitle }
                             .Where(text => !String.IsNullOrWhiteSpace(text))
                             .SelectMany(text => text.ToLower().Split(separators, StringSplitOptions.RemoveEmptyEntries))
                             .Where(word => word.Length >= 3) // we don't care for less than 3 letters
                             .Distinct()
                             .ToList();

            // Process every single key word.
            foreach (var keyWord in keyWords)
            {
                // Key words are already lower-case; the first three letters select the hub.
                var firstThreeLetters = keyWord.Substring(0, 3);
                if (!ArticlesHubDictionary.TryGetValue(firstThreeLetters, out var articlesHub))
                {
                    // Prefixes without their own hub share the REST hub.
                    articlesHub = ArticlesHubDictionary[REST];
                }

                articlesHub.AddArticle(keyWord, article);
            }

            dictionaryOfJsonArticles[articleFileInfo.Name] = articleJsonContent;

            if (dictionaryOfJsonArticles.Count >= resetCounter)
            {
                // Path.Combine gives the same path as plain concatenation when the directory ends
                // with a separator, and inserts the separator when it does not.
                var outputFileName = Path.Combine(articlesOutputDir, baseOutputFileName);

                // NOTE(review): the dictionary is not cleared here; presumably SerializeDictionary
                // or the caller resets it before the next batch - confirm.
                SerializeDictionary(dictionaryOfJsonArticles, outputFileName);
                convertedArticlesIndex++;
                Console.WriteLine("Converted {0} out of {1} articles.", convertedArticlesIndex * resetCounter, filesCount);
            }
        }