/// <summary>
/// Normalizes raw document text into stemmed tokens and builds the
/// <see cref="Document"/> that wraps the text, its tokens, and its vector.
/// </summary>
/// <remarks>
/// Steps, in order: carriage returns are removed and line feeds become spaces;
/// the text is passed through <c>RemoveSpecialCharacters</c>; it is lower-cased;
/// it is split on whitespace; each token is stemmed with a <c>PorterStemmer</c>;
/// the stemmed tokens and <paramref name="keywords"/> are handed to
/// <c>DocumentToVector</c>.
/// </remarks>
/// <param name="contents">Raw document text. Passed unmodified to the <see cref="Document"/> constructor.</param>
/// <param name="keywords">Keywords forwarded as-is to <c>DocumentToVector</c>.</param>
/// <returns>A new <see cref="Document"/> built from the original text, the stemmed tokens, and the vector.</returns>
/// <exception cref="System.ArgumentNullException"><paramref name="contents"/> is <c>null</c>.</exception>
public virtual Document PrepareDocument(string contents, IReadOnlyCollection<string> keywords)
{
    if (contents == null)
    {
        throw new System.ArgumentNullException(nameof(contents));
    }

    var singleLine = contents.Replace("\r", "").Replace("\n", " ");
    var cleaned = RemoveSpecialCharacters(singleLine);

    // Invariant casing keeps tokenization independent of the current thread culture
    // (e.g. under tr-TR, "I" would otherwise lower-case to a dotless "ı").
    var lower = cleaned.ToLowerInvariant();

    // A null separator array means "split on whitespace", same as the parameterless Split();
    // RemoveEmptyEntries drops the "" tokens that runs of consecutive whitespace would produce.
    var tokens = lower.Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);

    var stemmer = new PorterStemmer();
    var stemmed = tokens.Select(stemmer.StemWord).ToArray();
    var vector = this.DocumentToVector(stemmed, keywords);

    return new Document(contents, stemmed, vector);
}
/// <summary>
/// Creates a search engine with no documents, storing the stemmed form of each
/// supplied keyword.
/// </summary>
/// <param name="keywords">Keywords to stem and store; duplicates after stemming collapse because they are kept in a <c>HashSet</c>.</param>
/// <param name="sanitizer">Document sanitizer stored on the instance.</param>
public SearchEngine(string[] keywords, IDocumentSanitizer sanitizer)
{
    // Collaborators.
    this.sanitizer = sanitizer;
    this.stemmer = new PorterStemmer();

    // Empty state.
    this.documents = new List<Document>();
    this.keywords = new HashSet<string>();
    this.inverseDocumentFrequencies = new List<double>();
    this.feedbackCalculator = new RocchioFeedbackCalculator();

    // Store each keyword in its stemmed form.
    for (var index = 0; index < keywords.Length; index++)
    {
        this.keywords.Add(this.stemmer.StemWord(keywords[index]));
    }

    // NOTE(review): 9 and 100 are presumably the cluster count and iteration cap —
    // confirm against the KMeansClustering constructor.
    this.clusteringStrategy = new KMeansClustering(9, 100, new Random());
}