/// <summary>
/// Checks that <c>TitleExtractor.Extract</c> hands back a title that has no surrounding whitespace unchanged.
/// </summary>
public void Extract_ShouldReturnTitle()
{
    // Arrange
    const string input = "Kosogłos";

    // Act
    var extracted = TitleExtractor.Extract(input);

    // Assert
    extracted.Should().Be(input);
}
/// <summary>
/// Checks that <c>TitleExtractor.Extract</c> yields <c>null</c> when it is given a <c>null</c> string.
/// </summary>
public void Extract_Null_ShouldReturnNull()
{
    // Arrange
    string input = null;

    // Act
    var extracted = TitleExtractor.Extract(input);

    // Assert
    extracted.Should().BeNull();
}
/// <summary>
/// Checks that <c>TitleExtractor.Extract</c> yields <c>null</c> when it is given an empty string.
/// </summary>
public void Extract_EmptyText_ShouldReturnNull()
{
    // Arrange
    string input = string.Empty;

    // Act
    var extracted = TitleExtractor.Extract(input);

    // Assert
    extracted.Should().BeNull();
}
/// <summary>
/// Checks that <c>TitleExtractor.Extract</c> strips a single leading and trailing space from the title.
/// </summary>
public void Extract_Spaces_ShouldReturnTitle()
{
    // Arrange
    const string expected = "Kosogłos";
    string paddedInput = " " + expected + " ";

    // Act
    var extracted = TitleExtractor.Extract(paddedInput);

    // Assert
    extracted.Should().Be(expected);
}
/// <summary>
/// Builds one <see cref="Book"/> per entry of <c>CatalogData</c> by running each field through its extractor,
/// then hands the extracted values to <c>ImportBookValidator</c> together with the imported reference lists.
/// </summary>
/// <returns>The books created from <c>CatalogData</c>, in iteration order.</returns>
public List<Book> ImportBooksList()
{
    // Reference lists that the per-book values are checked against below.
    var knownAuthors = ImportAuthorsList();
    var seriesInfos = ImportSeriesListInfo();
    var knownPublishingHouses = ImportPublishingHousesList();
    var knownStoragePlaces = ImportStoragePlacesList();
    var knownCategories = ImportCategoriesList();

    // One series per distinct series name (first occurrence wins); entries without a name are dropped.
    var knownSeries = seriesInfos
        .GroupBy(info => info.SeriesName)
        .Select(sameName => sameName.First())
        .Where(info => !string.IsNullOrEmpty(info.SeriesName))
        .Select(info => info.ToSeries())
        .ToList();

    var importedBooks = new List<Book>();

    foreach (var row in CatalogData)
    {
        // A row carries a single category cell; when it extracts to null the book gets an empty list.
        var bookCategories = new List<Category>();
        Category extractedCategory = CategoryExtractor.Extract(row.Category);
        if (extractedCategory != null)
        {
            bookCategories.Add(extractedCategory);
        }

        // The series cell supplies both the series itself and the volume number.
        var seriesInfo = SeriesInfoExtractor.Extract(row.Series);

        var book = new Book
        {
            Id = Guid.NewGuid(),
            Title = TitleExtractor.Extract(row.Title),
            Authors = AuthorExtractor.Extract(row.Author),
            Series = seriesInfo?.ToSeries(),
            PublishingHouse = PublishingHouseExtractor.Extract(row.PublishingHouse),
            PublishmentYear = YearExtractor.Extract(row.Year),
            ISBN = IsbnExtractor.Extract(row.ISBN),
            Language = LanguageExtractor.Extract(row.Language),
            StoragePlace = StoragePlaceExtractor.Extract(row.StoragePlace),
            Comment = CommentExtractor.Extract(row.Comment),
            Categories = bookCategories,
            VolumeNumber = seriesInfo?.VolumeNumber
        };

        // NOTE(review): what the validator does on a mismatch (throw, log, ...) is not visible here.
        ImportBookValidator.CheckAuthors(knownAuthors, book.Authors);
        ImportBookValidator.CheckSeries(knownSeries, book.Series);
        ImportBookValidator.CheckPublishingHouse(knownPublishingHouses, book.PublishingHouse);
        ImportBookValidator.CheckStoragePlace(knownStoragePlaces, book.StoragePlace);
        ImportBookValidator.CheckCategory(knownCategories, book.Categories);

        importedBooks.Add(book);
    }

    return importedBooks;
}
/// <summary>
/// Analyzes <paramref name="content"/> and returns its word count, paragraphs, titles and scored keywords.
/// </summary>
/// <param name="content">
/// Text to analyze. It is stored on the result and passed unchanged to <c>TitleExtractor.Extract</c> and
/// <c>WordScraper.ScrapeToParagraphs</c>.
/// </param>
/// <returns>
/// A <see cref="KeywordAnalysis"/> with <c>Content</c>, <c>WordCount</c>, <c>Paragraphs</c>, <c>Titles</c> and
/// <c>Keywords</c> filled in. Keywords are added from the highest score to the lowest.
/// </returns>
/// <remarks>
/// Each term <c>w</c> gets the score <c>sum over g of (Fwg - nw * Pg)^2 / (nw * Pg)</c>, which has the shape of a
/// chi-square statistic. NOTE(review): the meaning of the inputs (term probabilities, co-occurrence counts) is
/// defined by the helper methods on this class and is not visible in this method.
/// Scores are used as keys of sorted dictionaries, so equal scores are nudged down in steps of 0.00001 to stay
/// unique; for title and merged entries the nudged value is what ends up in <c>Keyword.Rank</c>.
/// </remarks>
public KeywordAnalysis Analyze(string content)
{
    var analysis = new KeywordAnalysis { Content = content };

    int wordCount;
    var titles = TitleExtractor.Extract(content);
    var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

    // Flatten every word of every sentence into one list; used later to map stem terms back to terms.
    var allWords = new List<Word>();
    paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

    analysis.WordCount = wordCount;
    analysis.Paragraphs = paragraphs;
    analysis.Titles = titles;

    int termTotal = 0;

    // Run through each sentence and count the term occurrences.
    var termOccurrenceCounts = this.GetWordTermOccurence(paragraphs);
    var termNw = new Dictionary<string, int>();
    var termsX2 = new Dictionary<string, decimal>();

    // Produces termsG (the frequent terms) and an initialized termsX2.
    var termsG = this.SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

    // Fills termPg (returned) and termNw (by reference).
    var termPg = this.FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

    // Fills the termFwg collection, indexed as termFwg[w][g].
    var termFwg = this.FillTermFwgCollection(paragraphs, termsG);

    // Per the original author's note, the last entry of this array is the MAX g in G.
    var terms = new string[termsG.Count];
    termsG.Values.CopyTo(terms, 0);

    foreach (string w in terms)
    {
        decimal sumZ = 0;

        // Stops one short of the end: the calculation is done for all terms but the last (MAX) one.
        for (int i = 0; i < terms.Length - 1; i++)
        {
            string g = terms[i];
            if (w == g)
            {
                // Skip the diagonal.
                continue;
            }

            int nw = termNw[w];
            decimal pg = termPg[g];
            decimal expected = nw * pg;
            if (expected == 0.0m)
            {
                // Avoid dividing by zero; such pairs contribute nothing.
                continue;
            }

            decimal fwg = termFwg[w][g];
            decimal deviation = fwg - expected;
            sumZ += deviation * deviation / expected;
        }

        termsX2[w] = sumZ;
    }

    // Order the stem terms by score, lowest first.
    var sortedX2 = new SortedDictionary<decimal, string>();
    foreach (var pair in termsX2)
    {
        AddWithUniqueScore(sortedX2, pair.Value, pair.Key);
    }

    // Walk from the highest to the lowest score and map each stem term back to a term. When several stem terms
    // map to the same term, the one visited last (the lowest score) overwrites the earlier ones. The stored
    // value is the original score from termsX2, not the nudged sort key.
    var preres = new Dictionary<string, decimal>();
    foreach (var entry in sortedX2.Reverse())
    {
        string stemTerm = entry.Value;
        string term = this.GetTermFromStemTerm(allWords, stemTerm);
        preres[term] = termsX2[stemTerm];
    }

    // Score each title as the sum of the scores of the words in it that are known terms.
    // Case-list words are left alone here: per the original note they were already split and appended to the
    // sentence end for ranking.
    var tsort = new SortedDictionary<decimal, string>();
    foreach (var title in titles)
    {
        decimal titleScore = 0.0m;
        foreach (Match match in WordScraper.WordReg.Matches(title.Text))
        {
            decimal termScore;
            if (preres.TryGetValue(match.Value, out termScore))
            {
                titleScore += termScore;
            }
        }

        AddWithUniqueScore(tsort, titleScore, title.Text);
    }

    // Mix the single-term scores in with the title scores.
    foreach (var pre in preres)
    {
        AddWithUniqueScore(tsort, pre.Value, pre.Key);
    }

    // Collect from the highest score down; if a text appears more than once, its highest score wins.
    // The former "top 50" cut-off is disabled, so every entry is returned.
    var result = new Dictionary<string, decimal>();
    foreach (var entry in tsort.Reverse())
    {
        if (!result.ContainsKey(entry.Value))
        {
            result.Add(entry.Value, entry.Key);
        }
    }

    analysis.Keywords = result.Select(n => new Keyword { Word = n.Key, Rank = n.Value });
    return analysis;
}

/// <summary>
/// Adds <paramref name="text"/> to <paramref name="target"/> under <paramref name="score"/>, lowering the score in
/// steps of 0.00001 until the key is unused, because a sorted dictionary cannot hold duplicate keys.
/// </summary>
private static void AddWithUniqueScore(SortedDictionary<decimal, string> target, decimal score, string text)
{
    while (target.ContainsKey(score))
    {
        score -= 0.00001m;
    }

    target.Add(score, text);
}