/// <summary>
/// Searches the on-disk key word index for articles that are listed under every usable word of a phrase.
/// </summary>
/// <remarks>
/// The phrase is lower-cased and split on single spaces. A word is skipped (it neither narrows
/// nor empties the result) when it is shorter than three characters, when its index file does
/// not exist, or when the index does not contain it. The article lists of the remaining words
/// are intersected by unzipped JSON file name. Articles with a blank title are dropped when the
/// first matching word seeds the result.
/// </remarks>
/// <param name="phraseToSearch">Space-separated words to look for; null or blank yields an empty result.</param>
/// <returns>The articles common to all usable words; an empty <c>Articles</c> when nothing matched.</returns>
public Articles SearchArticles(string phraseToSearch)
{
    var commonArticles = new Articles();

    // Nothing to search for: return the empty result instead of failing on a null phrase.
    if (String.IsNullOrWhiteSpace(phraseToSearch))
    {
        return commonArticles;
    }

    var smallKeyWords = phraseToSearch
        .ToLower()
        .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
        .Distinct()
        .ToList();

    // False until the first matching key word has seeded commonArticles.
    bool haveCommonArticlesBeenSet = false;

    foreach (var keyWord in smallKeyWords)
    {
        if (keyWord.Length < 3)
        {
            // No interest in short words.
            continue;
        }

        // The key word is already lower-case; index files are keyed by the first three
        // letters, with REST as the fallback key for prefixes that have no hub of their own.
        var keyWordIndex = keyWord.Substring(0, 3);
        if (!ArticlesHubDictionary.ContainsKey(keyWordIndex))
        {
            keyWordIndex = REST;
        }

        var indexPath = Path.Combine(_outputDirectory, "index", $"index_{keyWordIndex}.bin.zip");
        if (!File.Exists(indexPath))
        {
            continue;
        }

        // Load the index file for this prefix and look the key word up in it.
        var articlesHub = new ArticlesHub();
        articlesHub.Deserialize(indexPath);

        if (!articlesHub.KeyWordsDictionary.Dictionary.ContainsKey(keyWord))
        {
            continue;
        }

        var articlesFound = articlesHub.KeyWordsDictionary.Dictionary[keyWord];

        if (!haveCommonArticlesBeenSet)
        {
            // First matching key word: seed the result with every article that has a title.
            var nonEmptyTitlesList = articlesFound.List
                .Where(x => !String.IsNullOrWhiteSpace(Zipper.Unzip(x.ZippedTitle.ToByteArray())));
            commonArticles.List.AddRange(nonEmptyTitlesList);
            haveCommonArticlesBeenSet = true;
            continue;
        }

        // Subsequent key words: keep only the articles whose JSON file name is already
        // present in the running result.
        var commonFileNames = commonArticles.List
            .Select(x => Zipper.Unzip(x.ZippedJsonFileName.ToByteArray()))
            .ToHashSet();

        var commonArticlesCopy = new Articles();
        foreach (var articleFound in articlesFound.List)
        {
            var articleFoundFileName = Zipper.Unzip(articleFound.ZippedJsonFileName.ToByteArray());
            if (commonFileNames.Contains(articleFoundFileName))
            {
                commonArticlesCopy.List.Add(articleFound);
            }
        }

        commonArticles = commonArticlesCopy;
    }

    return commonArticles;
}
/// <summary>
/// Indexes one JSON article under each of its key words and adds its content to the temporary
/// dictionary, which is serialized to a bucket file once it holds <paramref name="resetCounter"/> entries.
/// </summary>
/// <remarks>
/// Articles whose abstract and title are both blank are ignored. Key words are the distinct
/// lower-cased, space-separated words of the abstract followed by those of the title; words
/// shorter than three characters are not indexed.
/// </remarks>
/// <param name="convertedArticlesIndex">How many .bin.zip packed files are created; incremented after each bucket is written.</param>
/// <param name="resetCounter">Sets maximum amount of packed json articles per bin.zip file.</param>
/// <param name="articleFileInfo">File info of JSON article.</param>
/// <param name="articlesOutputDir">Directory the bucket files are written to.</param>
/// <param name="filesCount">How many JSON files are analysed in total (used for the progress message only).</param>
/// <param name="dictionaryOfJsonArticles">Temporary dictionary of articles periodically saved, keyed by JSON file name.</param>
private void AddArticle(
    ref int convertedArticlesIndex,
    int resetCounter,
    FileInfo articleFileInfo,
    string articlesOutputDir,
    int filesCount,
    Dictionary<String, String> dictionaryOfJsonArticles)
{
    var baseOutputFileName = "converted_articles_" + convertedArticlesIndex + ".bin";

    var articleJsonContent = File.ReadAllText(articleFileInfo.FullName);
    var articleAbstract = JsonArticleToDocument.GetArticleAbstract(articleJsonContent);
    var articleTitle = JsonArticleToDocument.GetArticleTitle(articleJsonContent);

    // Nothing to index when the article has neither an abstract nor a title.
    if (String.IsNullOrWhiteSpace(articleAbstract) && String.IsNullOrWhiteSpace(articleTitle))
    {
        return;
    }

    var article = new Article()
    {
        ZippedJsonFileName = ByteString.CopyFrom(Zipper.Zip(articleFileInfo.Name)),
        ZippedBucketFileName = ByteString.CopyFrom(Zipper.Zip(baseOutputFileName + ".zip")),
        ZippedTitle = ByteString.CopyFrom(Zipper.Zip(articleTitle)),
        CounterIndex = convertedArticlesIndex
    };

    // Abstract words take precedence and are followed by title words; Distinct keeps the
    // first occurrence, so the resulting order matches that precedence.
    var keyWords = new[] { articleAbstract, articleTitle }
        .Where(text => !String.IsNullOrWhiteSpace(text))
        .SelectMany(text => text
            .ToLower()
            .Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        .Distinct()
        .ToList();

    // Process every single key word.
    foreach (var keyWord in keyWords)
    {
        if (keyWord.Length < 3)
        {
            // We do not care about words with fewer than 3 letters.
            continue;
        }

        // Key words are already lower-case. Hubs are keyed by the first three letters, with
        // REST as the fallback key for prefixes that have no hub of their own.
        var articlesHubDictionaryKey = keyWord.Substring(0, 3);
        if (!ArticlesHubDictionary.ContainsKey(articlesHubDictionaryKey))
        {
            articlesHubDictionaryKey = REST;
        }

        ArticlesHubDictionary[articlesHubDictionaryKey].AddArticle(keyWord, article);
    }

    dictionaryOfJsonArticles[articleFileInfo.Name] = articleJsonContent;

    if (dictionaryOfJsonArticles.Count >= resetCounter)
    {
        // Path.Combine inserts the directory separator when articlesOutputDir lacks a trailing one.
        var outputFileName = Path.Combine(articlesOutputDir, baseOutputFileName);

        // NOTE(review): the dictionary is not cleared here; presumably SerializeDictionary
        // (or the caller) empties it after the bucket is written - confirm.
        SerializeDictionary(dictionaryOfJsonArticles, outputFileName);
        convertedArticlesIndex++;
        Console.WriteLine("Converted {0} out of {1} articles.", convertedArticlesIndex * resetCounter, filesCount);
    }
}