예제 #1
0
        /// <summary>
        /// Normalizes raw text into stemmed tokens and wraps the result in a <see cref="Document"/>.
        /// </summary>
        /// <param name="contents">Raw document text; passed unchanged to the resulting document.</param>
        /// <param name="keywords">Keywords handed to <c>DocumentToVector</c> together with the stemmed tokens.</param>
        /// <returns>
        /// A document built from the original text, its stemmed tokens, and the vector
        /// returned by <c>DocumentToVector</c> for those tokens.
        /// </returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="contents"/> is <c>null</c>.</exception>
        public virtual Document PrepareDocument(string contents, IReadOnlyCollection <string> keywords)
        {
            if (contents == null)
            {
                throw new System.ArgumentNullException("contents");
            }

            // Collapse the text onto one line: drop carriage returns, turn line feeds into spaces.
            var singleLine = contents.Replace("\r", "").Replace("\n", " ");
            var cleaned    = RemoveSpecialCharacters(singleLine);

            // Invariant lowercasing keeps tokens identical regardless of the machine's
            // current culture (e.g. Turkish 'I' -> 'ı' under ToLower()).
            var lower = cleaned.ToLowerInvariant();

            // Split() with no arguments splits on whitespace and yields empty strings for
            // consecutive separators; those are not words, so they are dropped before stemming.
            var tokens = lower.Split().Where(token => token.Length > 0);

            var stemmer = new PorterStemmer();
            var stemmed = tokens.Select(stemmer.StemWord).ToArray();
            var vector  = this.DocumentToVector(stemmed, keywords);

            return(new Document(contents, stemmed, vector));
        }
예제 #2
0
 /// <summary>
 /// Creates a search engine with empty document and inverse-document-frequency lists,
 /// a Porter stemmer, a Rocchio feedback calculator and a k-means clustering strategy.
 /// </summary>
 /// <param name="keywords">
 /// Keywords to index. Each one is stemmed before being stored in a set, so keywords
 /// that stem to the same value occupy a single entry.
 /// </param>
 /// <param name="sanitizer">Document sanitizer stored for later use by the engine.</param>
 public SearchEngine(string[] keywords, IDocumentSanitizer sanitizer)
 {
     this.sanitizer = sanitizer;

     // Empty collections that the rest of the engine fills in later.
     this.documents = new List <Document>();
     this.inverseDocumentFrequencies = new List <double>();

     this.feedbackCalculator = new RocchioFeedbackCalculator();

     // The stemmer and the keyword set must exist before the keywords are processed.
     this.stemmer  = new PorterStemmer();
     this.keywords = new HashSet <string>();

     for (var index = 0; index < keywords.Length; index++)
     {
         this.keywords.Add(this.stemmer.StemWord(keywords[index]));
     }

     // NOTE(review): 9 and 100 are presumably the cluster count and iteration cap;
     // confirm against the KMeansClustering constructor.
     this.clusteringStrategy = new KMeansClustering(9, 100, new Random());
 }