public void Extract_ShouldReturnTitle()
        {
            // A title with no surrounding whitespace is expected to come back unchanged.

            // Arrange
            const string input = "Kosogłos";

            // Act
            var extracted = TitleExtractor.Extract(input);

            // Assert
            extracted.Should().Be(input);
        }
        public void Extract_Null_ShouldReturnNull()
        {
            // A null input is expected to yield a null title.

            // Arrange
            string missingText = null;

            // Act
            var extracted = TitleExtractor.Extract(missingText);

            // Assert
            extracted.Should().BeNull();
        }
        public void Extract_EmptyText_ShouldReturnNull()
        {
            // An empty string is expected to yield a null title.

            // Arrange
            string emptyText = string.Empty;

            // Act
            var extracted = TitleExtractor.Extract(emptyText);

            // Assert
            extracted.Should().BeNull();
        }
Exemple #4
0
 /// <summary>
 /// Creates an <see cref="Unfluffer"/> with a new HTML parser and one new
 /// instance of each extractor it holds.
 /// </summary>
 public Unfluffer()
 {
     // HTML parser.
     _htmlParser = new HtmlParser();

     // One extractor per piece of page metadata.
     _languageExtractor = new LanguageExtractor();
     _titleExtractor = new TitleExtractor();
     _descriptionExtractor = new DescriptionExtractor();
     _favIconExtractor = new FavIconExtractor();
     _imageExtractor = new ImageExtractor();
     _authorExtractor = new AuthorExtractor();
     _canonicalExtractor = new CanonicalExtractor();
 }
        public void Extract_Spaces_ShouldReturnTitle()
        {
            // A title padded with one space on each side is expected to come back without the padding.

            // Arrange
            const string expectedTitle = "Kosogłos";
            string paddedText = " " + expectedTitle + " ";

            // Act
            var extracted = TitleExtractor.Extract(paddedText);

            // Assert
            extracted.Should().Be(expectedTitle);
        }
Exemple #6
0
        public void ShouldExtractFromTitleTag()
        {
            // Arrange: fill the title-tag template with the expected title and build a document from it.
            var markup = string.Format(TitleTagTemplate, "my title");
            var document = TestUtilites.CreateHtmlDocument(markup);
            var extractionContext = new ExtractorContext(document);
            var sut = new TitleExtractor();

            // Act
            sut.Execute(extractionContext);

            // Assert: the extractor is expected to have written the title onto the context.
            extractionContext.Title.Should().Be("my title");
        }
Exemple #7
0
        /// <summary>
        /// Builds one <see cref="Book"/> per entry of <c>CatalogData</c>, filling each field
        /// through the matching extractor, and hands the extracted values to the
        /// <c>ImportBookValidator</c> checks together with the imported reference lists.
        /// </summary>
        /// <returns>The created books, in the order <c>CatalogData</c> is enumerated.</returns>
        public List<Book> ImportBooksList()
        {
            // Reference lists passed to the validator checks below.
            var knownAuthors = ImportAuthorsList();
            var importedSeriesInfos = ImportSeriesListInfo();
            var knownPublishingHouses = ImportPublishingHousesList();
            var knownStoragePlaces = ImportStoragePlacesList();
            var knownCategories = ImportCategoriesList();

            // One series per distinct series name (first occurrence wins);
            // entries with a null or empty name are dropped.
            var knownSeries = importedSeriesInfos
                .GroupBy(info => info.SeriesName)
                .Select(sameName => sameName.First())
                .Where(info => !string.IsNullOrEmpty(info.SeriesName))
                .Select(info => info.ToSeries())
                .ToList();

            var importedBooks = new List<Book>();

            foreach (var row in CatalogData)
            {
                // A row carries at most one category; a null extraction result leaves the list empty.
                var rowCategories = new List<Category>();
                Category extractedCategory = CategoryExtractor.Extract(row.Category);
                if (extractedCategory != null)
                {
                    rowCategories.Add(extractedCategory);
                }

                // Series info supplies both the series and the volume number; either may be absent.
                var seriesInfo = SeriesInfoExtractor.Extract(row.Series);

                var importedBook = new Book
                {
                    Id = Guid.NewGuid(),
                    Title = TitleExtractor.Extract(row.Title),
                    Authors = AuthorExtractor.Extract(row.Author),
                    Series = seriesInfo?.ToSeries(),
                    PublishingHouse = PublishingHouseExtractor.Extract(row.PublishingHouse),
                    PublishmentYear = YearExtractor.Extract(row.Year),
                    ISBN = IsbnExtractor.Extract(row.ISBN),
                    Language = LanguageExtractor.Extract(row.Language),
                    StoragePlace = StoragePlaceExtractor.Extract(row.StoragePlace),
                    Comment = CommentExtractor.Extract(row.Comment),
                    Categories = rowCategories,
                    VolumeNumber = seriesInfo?.VolumeNumber
                };

                ImportBookValidator.CheckAuthors(knownAuthors, importedBook.Authors);
                ImportBookValidator.CheckSeries(knownSeries, importedBook.Series);
                ImportBookValidator.CheckPublishingHouse(knownPublishingHouses, importedBook.PublishingHouse);
                ImportBookValidator.CheckStoragePlace(knownStoragePlaces, importedBook.StoragePlace);
                ImportBookValidator.CheckCategory(knownCategories, importedBook.Categories);

                importedBooks.Add(importedBook);
            }

            return importedBooks;
        }
Exemple #8
0
        public void ShouldExtractFromOpenGraph(string title, string expectedTitle)
        {
            // Arrange: a document with no title tag, so only the Open Graph data carries a title.
            // NOTE(review): the closing tag in the literal below is spelled "hmtl"; kept as-is — confirm whether intentional.
            var document = TestUtilites.CreateHtmlDocument("<html></hmtl>");

            var openGraphData = new Dictionary<string, string>();
            openGraphData.Add("title", title);
            openGraphData.Add("site_name", "hi");

            var extractionContext = new ExtractorContext(document)
            {
                OpenGraph = openGraphData,
                Domain = "www.example.com"
            };

            var sut = new TitleExtractor();

            // Act
            sut.Execute(extractionContext);

            // Assert
            extractionContext.Title.Should().Be(expectedTitle);
        }
        /// <summary>
        /// Analyzes <paramref name="content"/> and ranks its terms and titles as keywords.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// <list type="number">
        /// <item>Extract titles and paragraphs; record the word count.</item>
        /// <item>
        /// For every frequent term w compute an X2 score: the sum, over every frequent term g
        /// except the last one in <c>termsG</c>, of <c>(Fwg - nw * Pg)^2 / (nw * Pg)</c>,
        /// skipping g == w and any g for which <c>nw * Pg</c> is zero.
        /// </item>
        /// <item>Map each stem term to a term through <c>GetTermFromStemTerm</c>.</item>
        /// <item>Score each title as the sum of the scores of the words matched in its text.</item>
        /// <item>Merge titles and terms, ordered by descending score; the first occurrence of a text wins.</item>
        /// </list>
        /// Scores are used as sorted-dictionary keys, so a score that is already taken is lowered
        /// in steps of 0.00001 until it is unique. The reported <c>Rank</c> of a keyword can
        /// therefore be slightly below its raw score. No limit is applied to the number of keywords.
        /// </remarks>
        /// <param name="content">The text to analyze.</param>
        /// <returns>
        /// A <see cref="KeywordAnalysis"/> with <c>Content</c>, <c>WordCount</c>, <c>Paragraphs</c>,
        /// <c>Titles</c> and <c>Keywords</c> populated.
        /// </returns>
        public KeywordAnalysis Analyze(string content)
        {
            var analysis = new KeywordAnalysis {
                Content = content
            };
            int wordCount  = 0;
            var titles     = TitleExtractor.Extract(content);
            var paragraphs = WordScraper.ScrapeToParagraphs(content, out wordCount);

            // Flatten every sentence's words into one list; passed to GetTermFromStemTerm below.
            var allWords = new List<Word>();

            paragraphs.ForEach(p => p.Sentences.ForEach(s => allWords.AddRange(s.Words)));

            analysis.WordCount  = wordCount;
            analysis.Paragraphs = paragraphs;
            analysis.Titles     = titles;

            int termTotal = 0;

            // Run through each sentence and grab two and three word segments and add them to the term count.
            var termOccurrenceCounts = this.GetWordTermOccurence(paragraphs);

            var termNw  = new Dictionary<string, int>();
            var termsX2 = new Dictionary<string, decimal>();

            // This gets us termsG for frequent terms, and an initialized termsX2.
            var termsG = this.SortTermsIntoProbabilities(termOccurrenceCounts, ref termsX2, ref termTotal);

            // Now we have to fill termPg and termNw with values.
            var termPg = this.FillTermPgNwCollections(paragraphs, termsG, ref termNw, ref termTotal);

            // Now we have to fill the termFwg collection.
            var termFwg = this.FillTermFwgCollection(paragraphs, termsG);

            var terms = new string[termsG.Count];

            termsG.Values.CopyTo(terms, 0); // gives terms array where last term is the MAX g in G

            foreach (string w in terms)
            {
                decimal sumZ = 0;

                for (int i = 0; i < terms.Length - 1; i++) // do calcs for all but MAX
                {
                    string g = terms[i];

                    if (w == g)
                    {
                        continue; // skip where on the diagonal
                    }

                    // Looked up inside the loop on purpose: when no g qualifies, termNw is never consulted for w.
                    int     nw = termNw[w];
                    decimal Pg = termPg[g];
                    decimal D  = nw * Pg;

                    if (D != 0.0m)
                    {
                        decimal Fwg = termFwg[w][g];
                        decimal T   = Fwg - D;

                        sumZ += T * T / D;
                    }
                }

                termsX2[w] = sumZ;
            }

            // Order the stem terms by X2 score (ascending); equal scores are nudged apart.
            var sortedX2 = new SortedDictionary<decimal, string>();

            foreach (var pair in termsX2)
            {
                AddWithUniqueScore(sortedX2, pair.Value, pair.Key);
            }

            // Walk from highest to lowest X2. When several stem terms map to the same term,
            // the one visited last (the lowest score) is the value that remains.
            var preres = new Dictionary<string, decimal>();

            foreach (string stemTerm in sortedX2.Values.Reverse())
            {
                string term = this.GetTermFromStemTerm(allWords, stemTerm);

                preres[term] = termsX2[stemTerm];
            }

            // Score each title as the sum of the scores of the words found in its text.
            var tsort = new SortedDictionary<decimal, string>();

            foreach (var title in titles)
            {
                decimal tscore = 0.0m;

                foreach (Match m in WordScraper.WordReg.Matches(title.Text))
                {
                    decimal wordScore;

                    if (preres.TryGetValue(m.Value, out wordScore))
                    {
                        tscore += wordScore;
                    }
                }

                AddWithUniqueScore(tsort, tscore, title.Text);
            }

            // Mix the single terms in with the titles.
            foreach (var pre in preres)
            {
                AddWithUniqueScore(tsort, pre.Value, pre.Key);
            }

            // Highest score first; a text that appears more than once keeps its highest score.
            var result = new Dictionary<string, decimal>();

            foreach (var scored in tsort.Reverse())
            {
                if (!result.ContainsKey(scored.Value))
                {
                    result.Add(scored.Value, scored.Key);
                }
            }

            analysis.Keywords = result.Select(n => new Keyword {
                Word = n.Key, Rank = n.Value
            });

            return analysis;
        }

        /// <summary>
        /// Adds <paramref name="text"/> to <paramref name="sorted"/> under <paramref name="score"/>,
        /// lowering the score in steps of 0.00001 until it does not collide with an existing key.
        /// </summary>
        private static void AddWithUniqueScore(SortedDictionary<decimal, string> sorted, decimal score, string text)
        {
            while (sorted.ContainsKey(score))
            {
                score -= 0.00001m;
            }

            sorted.Add(score, text);
        }