Beispiel #1
0
        /// <summary>
        /// Runs the given query against the index and returns the documents that match any of its words.
        /// </summary>
        /// <param name="query">The raw query text.</param>
        /// <returns>
        /// An empty list when no word survives stemming and stopword filtering; the matched
        /// documents unranked when fewer than two were found; otherwise the list produced by
        /// <c>Ranker.RankQuery</c>.
        /// </returns>
        public static List<Document> Search(String query)
        {
            // Break the query into words, passing ":" as the second argument to Splitwords.
            // NOTE(review): the original comment says this also strips punctuation and lowercases - confirm in Semanter.
            List<String> tokens = Semanter.Splitwords(query, ":").ToList();

            // Determine which types the query may be asking for; this runs on the unstemmed words.
            HashSet<String> candidateTypes = TypeChecker(tokens);

            // Stem each word and drop stems that are stopwords.
            // (An Except-based filter over the stopwords was noted as slower, hence the explicit loop.)
            List<String> stems = new List<String>();
            foreach (String token in tokens)
            {
                String stemmed = invt.Samantha.StemWord(token);
                if (invt.Stopwords.Contains(stemmed))
                {
                    continue;
                }
                stems.Add(stemmed);
            }

            // Nothing left to search for.
            if (stems.Count == 0)
            {
                return new List<Document>();
            }

            // Look up the documents for the remaining stems.
            Dictionary<Document, Dictionary<string, List<int>>> matches = DocsFound(stems, candidateTypes);

            // With zero or one hit there is nothing to rank.
            return matches.Keys.Count < 2
                ? matches.Keys.ToList()
                : Ranker.RankQuery(stems, matches, invt.DocumentCount);
        }
Beispiel #2
0
 /// <summary>
 /// Initializes a new instance of the <see cref="Inverter" /> class.
 /// </summary>
 /// <param name="StopWords">The path to a file containing the stop words, one per line.</param>
 /// <param name="DictionaryPath">The path to the file holding all legal words.</param>
 /// <param name="CommonWordsPath">The path to the file holding the most commonly used words.</param>
 /// <param name="FormatsPath">The path to the file holding all supported file types.</param>
 /// <param name="BooksPaths">Paths to books, used for frequency analysis of words.</param>
 /// <exception cref="IOException">
 /// The stop words file could not be read. Any exception raised while loading it is wrapped
 /// in this type; failures reading <paramref name="FormatsPath" /> are not wrapped here.
 /// </exception>
 public Inverter(String StopWords, String DictionaryPath, String CommonWordsPath, String FormatsPath, List<String> BooksPaths)
 {
     // Load the supported formats first.
     Formats = new Dictionary<string, List<string>>();
     string[] formatLines = File.ReadAllLines(FormatsPath);
     AddToFormats(formatLines);

     // Build the semanter and feed it every book.
     _samantha = new Semanter(DictionaryPath, CommonWordsPath);

     // Each book is currently added with the same second argument (1); distributing
     // weight by file size was an idea noted here by the original author (Tomiwa).
     foreach (string bookPath in BooksPaths)
     {
         _samantha.AddToDictionary(bookPath, 1);
     }

     _stopwords = new HashSet<string>();

     // Start the count at zero, create the store, then take the count from Files.
     _documentCount = 0;
     this.store = new Store();
     _documentCount = Files.Count;

     // Load the stop words; any failure is reported to the caller as an IOException.
     try
     {
         string[] stopwordLines = File.ReadAllLines(StopWords);
         for (int i = 0; i < stopwordLines.Length; i++)
         {
             _stopwords.Add(stopwordLines[i]);
         }
     }
     catch (Exception ex)
     {
         throw new IOException("The specified path for stopwords could not be Read", ex);
     }
 }
Beispiel #3
0
 /// <summary>
 /// Re-reads a modified document and passes its words to the inverted index table for updating.
 /// </summary>
 /// <param name="doc">The document that was modified; its <c>Address</c> is handed to the extractor.</param>
 /// <returns>The document returned by <c>invt.ModifyDocument</c>.</returns>
 public static Document ModifyFile(Document doc)
 {
     // Extract the document's content from its address.
     var extracted = x.Extract(doc.Address);

     // Split the extracted text into words.
     String[] tokens = Semanter.Splitwords(extracted.Text);

     // Hand the words and the document to the index.
     Document updated = invt.ModifyDocument(tokens, doc);
     return updated;
 }
Beispiel #4
0
 /// <summary>
 /// Extracts the text of the specified document, splits it into words and adds them to the inverted index table.
 /// </summary>
 /// <param name="doc">The document to be tokenized; its <c>Address</c> is handed to the extractor.</param>
 /// <exception cref="TextExtractionException">Could not extract text from the document.</exception>
 public static void AddFileFrom(Document doc)
 {
     // Extract the document's content from its address.
     var extracted = x.Extract(doc.Address);

     // Split the extracted text into words.
     String[] tokens = Semanter.Splitwords(extracted.Text);

     // Register the words for this document in the index.
     invt.AddDocument(tokens, doc);
 }