/// <summary>
/// Runs the content-summarizing stage of the summarizing algorithm: validates the inputs,
/// clamps the numeric limits carried by <paramref name="arguments"/>, and asks
/// <paramref name="contentSummarizer"/> for the concepts and sentences of the summary.
/// </summary>
/// <param name="analyzedDocument">The analyzed document handed to the summarizer.</param>
/// <param name="contentSummarizer">The summarizer that produces the concept and sentence lists.</param>
/// <param name="arguments">
/// The summarizer settings. Out-of-range values are corrected in place on this instance:
/// negative FilteringConceptsCap, MaxSummarySentences and MaxSummarySizeInPercent become 0,
/// and a MaxSummarySizeInPercent above 100 becomes 100.
/// </param>
/// <returns>A <see cref="SummarizedDocument"/> holding the concepts and sentences returned by the summarizer.</returns>
/// <exception cref="ArgumentNullException">Any of the three parameters is null.</exception>
/// <exception cref="InvalidOperationException">The summarizer returned null for the concepts or for the sentences.</exception>
public SummarizedDocument SummarizeAnalyzedContent(AnalyzedDocument analyzedDocument, IContentSummarizer contentSummarizer, ISummarizerArguments arguments)
{
    // Guard clauses, checked in parameter order.
    if (analyzedDocument == null)
    {
        throw new ArgumentNullException(nameof(analyzedDocument));
    }

    if (contentSummarizer == null)
    {
        throw new ArgumentNullException(nameof(contentSummarizer));
    }

    if (arguments == null)
    {
        throw new ArgumentNullException(nameof(arguments));
    }

    // Range adjustment: the caller's argument object is mutated so the summarizer
    // only ever sees non-negative limits and a percentage within 0..100.
    if (arguments.FilteringConceptsCap < 0)
    {
        arguments.FilteringConceptsCap = 0;
    }

    if (arguments.MaxSummarySentences < 0)
    {
        arguments.MaxSummarySentences = 0;
    }

    if (arguments.MaxSummarySizeInPercent < 0)
    {
        arguments.MaxSummarySizeInPercent = 0;
    }

    if (arguments.MaxSummarySizeInPercent > 100)
    {
        arguments.MaxSummarySizeInPercent = 100;
    }

    // Concepts are requested first; a null result is treated as a summarizer contract violation.
    List<string> concepts = contentSummarizer.GetConcepts(analyzedDocument, arguments);
    if (concepts == null)
    {
        throw new InvalidOperationException($"{contentSummarizer.GetType().FullName}.GetConcepts must not return null");
    }

    List<string> sentences = contentSummarizer.GetSentences(analyzedDocument, arguments);
    if (sentences == null)
    {
        throw new InvalidOperationException($"{contentSummarizer.GetType().FullName}.GetSentences must not return null");
    }

    SummarizedDocument summary = new SummarizedDocument();
    summary.Concepts = concepts;
    summary.Sentences = sentences;
    return summary;
}
/// <summary>
/// Builds the list of concepts from the formatted values of the document's scored text units.
/// </summary>
/// <param name="analyzedDocument">The analyzed document whose ScoredTextUnits are filtered.</param>
/// <param name="summarizerArguments">Supplies FilteringConceptsCap, used as an index into ScoredTextUnits.</param>
/// <returns>
/// Every formatted value when the number of scored text units does not exceed FilteringConceptsCap;
/// otherwise the formatted values of all units whose score is at least the score of the unit found
/// at index FilteringConceptsCap.
/// </returns>
public List<string> GetConcepts(AnalyzedDocument analyzedDocument, ISummarizerArguments summarizerArguments)
{
    var scoredUnits = analyzedDocument.ScoredTextUnits;
    int cap = summarizerArguments.FilteringConceptsCap;

    if (scoredUnits.Count > cap)
    {
        // The score at the cap position is the cut-off; the comparison is inclusive, so the
        // unit at that position and any unit tying with it are kept as well.
        // NOTE(review): presumably ScoredTextUnits is ordered by descending score - confirm.
        double cutOffScore = scoredUnits[cap].Score;

        return (from unit in scoredUnits
                where unit.Score >= cutOffScore
                select unit.ScoredTextUnit.FormattedValue).ToList();
    }

    // Not more units than the cap: nothing to filter out.
    return (from unit in scoredUnits
            select unit.ScoredTextUnit.FormattedValue).ToList();
}
/// <summary>
/// Runs the whole pipeline - parse, analyze, summarize - on the supplied content using the
/// parser, analyzer and summarizer obtained from <paramref name="args"/>.
/// </summary>
/// <param name="contentProvider">The source of the content to summarize.</param>
/// <param name="args">The summarizer settings and the factory methods for the pipeline stages.</param>
/// <returns>
/// The summarized document, or a newly constructed <see cref="SummarizedDocument"/> when
/// <paramref name="contentProvider"/> or <paramref name="args"/> is null.
/// </returns>
public static SummarizedDocument Summarize(IContentProvider contentProvider, ISummarizerArguments args)
{
    // Missing inputs produce a default result instead of an exception.
    if (contentProvider == null || args == null)
    {
        return new SummarizedDocument();
    }

    SummarizingEngine engine = new SummarizingEngine();

    // Each stage consumes the output of the previous one.
    ParsedDocument parsed = engine.ParseContent(contentProvider, args.ContentParser());
    AnalyzedDocument analyzed = engine.AnalyzeParsedContent(parsed, args.ContentAnalyzer());

    return engine.SummarizeAnalyzedContent(analyzed, args.ContentSummarizer(), args);
}
/// <summary>
/// Picks sentences for the summary by walking ScoredSentences from the first entry onward and
/// returns their original text in original document order.
/// </summary>
/// <param name="analyzedDocument">The analyzed document whose ScoredSentences are selected from.</param>
/// <param name="summarizerArguments">
/// Supplies MaxSummarySizeInPercent (share of the total text-unit count used as the word budget)
/// and MaxSummarySentences (upper bound on the number of sentences selected).
/// </param>
/// <returns>
/// The OriginalSentence of every selected sentence, ordered by OriginalSentenceIndex.
/// The list is empty when either limit is zero or the document has no sentences.
/// </returns>
public List<string> GetSentences(AnalyzedDocument analyzedDocument, ISummarizerArguments summarizerArguments)
{
    var scoredSentences = analyzedDocument.ScoredSentences;

    int totalContentWordCount = scoredSentences.Sum(s => s.ScoredSentence.TextUnits.Count);

    // Computed in 64-bit so that percent * word count cannot silently wrap around for very large inputs.
    long targetWordCount = (long)summarizerArguments.MaxSummarySizeInPercent * totalContentWordCount / 100;
    int maxSentences = summarizerArguments.MaxSummarySentences;

    int currentWordCount = 0;
    int currentSentenceIndex = 0;
    List<Sentence> selectedSentences = new List<Sentence>();

    // Every entry is a candidate, including the last one: the previous bound of Count - 1
    // made the final entry unselectable, so a one-sentence document always produced an
    // empty summary. Both budgets are tested before a sentence is added, which means the
    // sentence that crosses the word budget is still included.
    // NOTE(review): presumably ScoredSentences is ordered best-first - confirm against the analyzer.
    while (currentSentenceIndex < scoredSentences.Count
           && selectedSentences.Count < maxSentences
           && currentWordCount < targetWordCount)
    {
        Sentence selectedSentence = scoredSentences[currentSentenceIndex].ScoredSentence;
        selectedSentences.Add(selectedSentence);
        currentWordCount += selectedSentence.TextUnits.Count;
        currentSentenceIndex += 1;
    }

    // Restore reading order before handing the text back.
    return selectedSentences
        .OrderBy(s => s.OriginalSentenceIndex)
        .Select(s => s.OriginalSentence)
        .ToList();
}