/// <summary>
/// Runs a search for the given query and returns the documents that matched it.
/// The query is split into words, each word is stemmed, stems that are stop words
/// are dropped, and the remaining stems are looked up. When two or more documents
/// match, the result is passed through <c>Ranker.RankQuery</c>; otherwise the
/// matches are returned as found.
/// </summary>
/// <param name="query">The raw query text.</param>
/// <returns>
/// An empty list when no searchable stems remain; the unranked matches when fewer
/// than two documents were found; otherwise the list produced by the ranker.
/// </returns>
public static List<Document> Search(String query)
{
    // Tokenise the query (original author's note: separates words, removes
    // punctuation, lowercases). NOTE(review): the ":" argument is presumably a
    // character that Splitwords keeps, e.g. for type filters - confirm in Semanter.
    List<String> queryTerms = Semanter.Splitwords(query, ":").ToList();

    // The type filter is derived from the raw (unstemmed) terms.
    HashSet<String> candidateTypes = TypeChecker(queryTerms);

    // Stem every term and keep only the stems that are not stop words.
    List<String> searchStems = new List<String>();
    foreach (string term in queryTerms)
    {
        string stemmed = invt.Samantha.StemWord(term);
        if (invt.Stopwords.Contains(stemmed))
        {
            continue;
        }

        searchStems.Add(stemmed);
    }

    // Nothing left to look up.
    if (searchStems.Count == 0)
    {
        return new List<Document>();
    }

    Dictionary<Document, Dictionary<string, List<int>>> matches = DocsFound(searchStems, candidateTypes);

    // Ranking is only needed when there is more than one candidate.
    if (matches.Count >= 2)
    {
        return Ranker.RankQuery(searchStems, matches, invt.DocumentCount);
    }

    return matches.Keys.ToList();
}
/// <summary>
/// Initializes a new instance of the <see cref="Inverter" /> class: loads the
/// supported formats, builds the <see cref="Semanter" />, feeds it the supplied
/// books, creates the store and reads the stop-word list.
/// </summary>
/// <param name="StopWords">The path to a file containing all the stop words, one per line.</param>
/// <param name="DictionaryPath">The path to the file holding all legal words.</param>
/// <param name="CommonWordsPath">The path to the file holding the most commonly used words.</param>
/// <param name="FormatsPath">The path to the file holding all supported file types.</param>
/// <param name="BooksPaths">Paths to books, used for frequency analysis of words.</param>
/// <exception cref="IOException">
/// The stop-words file could not be read. Only the stop-word read is wrapped this way;
/// failures while reading <paramref name="FormatsPath" /> propagate unchanged.
/// </exception>
public Inverter(String StopWords, String DictionaryPath, String CommonWordsPath, String FormatsPath, List<String> BooksPaths)
{
    // Supported formats come straight from the lines of the formats file.
    Formats = new Dictionary<string, List<string>>();
    string[] formatLines = File.ReadAllLines(FormatsPath);
    AddToFormats(formatLines);

    _samantha = new Semanter(DictionaryPath, CommonWordsPath);

    // Every book is added with the same weight of 1.
    // (Original author's note: Tomiwa's idea about weight distribution based on file size.)
    foreach (string bookPath in BooksPaths)
    {
        _samantha.AddToDictionary(bookPath, 1);
    }

    _stopwords = new HashSet<string>();

    // NOTE(review): this reset is overwritten by the Files.Count assignment below.
    _documentCount = 0;
    store = new Store();
    _documentCount = Files.Count;

    try
    {
        string[] stopWordLines = File.ReadAllLines(StopWords);
        foreach (string line in stopWordLines)
        {
            _stopwords.Add(line);
        }
    }
    catch (Exception ex)
    {
        // Any failure here is reported to the caller as an IOException.
        throw new IOException("The specified path for stopwords could not be Read", ex);
    }
}
/// <summary>
/// Re-indexes a modified document: extracts the text at the document's address,
/// splits it into words and hands both to the inverted index via <c>ModifyDocument</c>.
/// </summary>
/// <param name="doc">The document that was modified.</param>
/// <returns>The <see cref="Document" /> returned by the index's <c>ModifyDocument</c> call.</returns>
public static Document ModifyFile(Document doc)
{
    // Pull the current text out of the file behind the document.
    var extraction = x.Extract(doc.Address);
    var text = extraction.Text;

    // Tokenise and let the index update its entries for this document.
    String[] tokens = Semanter.Splitwords(text);
    Document updated = invt.ModifyDocument(tokens, doc);
    return updated;
}
/// <summary>
/// Indexes a document: extracts the text at the document's address, splits it
/// into words and adds them to the inverted index via <c>AddDocument</c>.
/// </summary>
/// <param name="doc">The document to be tokenized and indexed.</param>
/// <exception cref="TextExtractionException">
/// Text could not be extracted from the document (per the original documentation;
/// presumably raised by the extractor - it is not thrown directly in this method).
/// </exception>
public static void AddFileFrom(Document doc)
{
    // Pull the text out of the file behind the document.
    var extraction = x.Extract(doc.Address);
    var text = extraction.Text;

    // Tokenise and register the words for this document in the index.
    String[] tokens = Semanter.Splitwords(text);
    invt.AddDocument(tokens, doc);
}