/// <summary>
/// Evaluates this query against <paramref name="registry"/> and returns the
/// matching index entries with every collected removal subtracted.
/// </summary>
/// <param name="registry">Registry whose index is queried.</param>
/// <returns>The entries produced by the recursive evaluation, minus the removals gathered along the way.</returns>
public HashSet<CrawlerRegistry.IndexEntry> Execute(CrawlerRegistry registry)
{
    // Filled by the recursive overload with the entries that must be excluded.
    var excluded = new HashSet<CrawlerRegistry.IndexEntry>();
    var matches = this.Execute(registry, excluded);

    // Counts are reported before the removals are applied.
    Console.WriteLine("[TopLevel] {0} entries, {1} removals.", matches.Count, excluded.Count);

    matches.ExceptWith(excluded);
    return matches;
}
/// <summary>
/// Tokenizes a free-text query, drops short tokens and stop words, stems the
/// remainder and runs the ranked search on the distinct stems.
/// </summary>
/// <param name="query">Raw query text; tokens are separated by single spaces.</param>
/// <param name="registry">Registry whose index is searched.</param>
/// <param name="maxResults">Maximum number of results to return.</param>
/// <param name="usePageRank">Forwarded to the ranked search to enable PageRank re-ordering.</param>
/// <returns>The ranked results produced by the token-based overload.</returns>
public LinkedList<IndexEntry> Execute(string query, CrawlerRegistry registry, int maxResults = 25, bool usePageRank = false)
{
    var stemmer = new PorterStemmer();

    // A set removes duplicate stems while keeping first-seen order for the list below.
    var stemmedTokens = new HashSet<string>();

    // Lower-case once with the invariant culture so that tokenization does not
    // depend on the current thread culture (e.g. Turkish dotless i).
    // NOTE(review): the indexing side should normalize terms the same way - confirm.
    foreach (var token in query.ToLowerInvariant().Split(' '))
    {
        // Empty strings produced by consecutive spaces are dropped here as well.
        if (token.Length <= 1 || StopWords.StopWordsList.Contains(token))
        {
            continue;
        }

        stemmedTokens.Add(stemmer.StemWord(token));
    }

    return this.Execute(stemmedTokens.ToList(), registry, maxResults, usePageRank);
}
/// <summary>
/// Scores every document that contains at least one of <paramref name="tokens"/>
/// and returns the best matches, optionally re-ordered by PageRank.
/// </summary>
/// <remarks>
/// A document's per-token weight is <c>1 + log10(frequency)</c>. The weights are
/// normalized by the Euclidean norm of the document's matched-token weights and
/// combined with the query weights from <c>CalculateQueryWeights</c>.
/// </remarks>
/// <param name="tokens">Query tokens to look up in the index.</param>
/// <param name="registry">Registry providing index entries and PageRank values.</param>
/// <param name="maxResults">Maximum number of results to return.</param>
/// <param name="usePageRank">
/// When true, a candidate pool of up to four times <paramref name="maxResults"/> is
/// taken by score and then re-ordered by <c>registry.PageRanks</c>.
/// </param>
/// <returns>Result entries built from link id and score, in final ranking order.</returns>
protected LinkedList<IndexEntry> Execute(List<string> tokens, CrawlerRegistry registry, int maxResults, bool usePageRank)
{
    // Size multiplier of the candidate pool that is re-ranked by PageRank.
    const int pageRankCandidateFactor = 4;

    var queryWeights = this.CalculateQueryWeights(tokens, registry);

    // link id -> (token -> log-scaled term-frequency weight)
    var documentWeights = new Dictionary<int, Dictionary<string, double>>();
    foreach (var token in tokens)
    {
        foreach (var entry in registry.GetIndexEntries(token))
        {
            Dictionary<string, double> weightsForDocument;
            if (!documentWeights.TryGetValue(entry.LinkId, out weightsForDocument))
            {
                weightsForDocument = new Dictionary<string, double>();
                documentWeights[entry.LinkId] = weightsForDocument;
            }

            weightsForDocument[token] = 1 + Math.Log10(entry.Frequency);
        }
    }

    var scores = new Dictionary<int, double>();
    foreach (var doc in documentWeights)
    {
        // Derive the norm from the weights actually stored, so a repeated token or
        // posting (which overwrites its weight above) cannot inflate the norm.
        var norm = Math.Sqrt(doc.Value.Values.Sum(weight => weight * weight));
        scores[doc.Key] = doc.Value.Sum(term => (term.Value / norm) * queryWeights[term.Key]);
    }

    // Clamp instead of letting maxResults * 4 wrap to a negative count, which
    // would make Take() return nothing.
    var candidateCount = maxResults;
    if (usePageRank)
    {
        candidateCount = maxResults > int.MaxValue / pageRankCandidateFactor
            ? int.MaxValue
            : maxResults * pageRankCandidateFactor;
    }

    IEnumerable<KeyValuePair<int, double>> ranked =
        scores.OrderByDescending(score => score.Value).Take(candidateCount);

    if (usePageRank)
    {
        // NOTE(review): assumes PageRanks has a value for every scored link id - confirm.
        ranked = ranked.OrderByDescending(score => registry.PageRanks[score.Key]).Take(maxResults);
    }

    var results = new LinkedList<IndexEntry>();
    foreach (var score in ranked)
    {
        results.AddLast(new IndexEntry(score.Key, score.Value));
    }

    return results;
}
/// <summary>
/// Writes the registry's links to <paramref name="fileName"/>: a header made of
/// <c>FileIdent</c> and <c>FileVersion</c>, the link count, then each link's key
/// followed by the link's own serialized form.
/// </summary>
/// <param name="fileName">Path handed to <c>Utilities.GetWriterForFile</c>.</param>
/// <param name="registry">Registry whose <c>Links</c> are persisted.</param>
public static void SaveToFile(string fileName, CrawlerRegistry registry)
{
    var bw = Utilities.GetWriterForFile(fileName);
    try
    {
        // Header
        bw.Write(FileIdent);
        bw.Write(FileVersion);

        // Entries
        bw.Write(registry.Links.Count);
        foreach (var registryLink in registry.Links)
        {
            bw.Write(registryLink.Key);
            registryLink.Value.SaveTo(bw);
        }
    }
    finally
    {
        // Close the writer even when a write fails part-way, so the file handle
        // is not leaked.
        if (bw != null)
        {
            bw.Close();
        }
    }
}
/// <summary>
/// Reads a registry previously written by <c>SaveToFile</c>.
/// </summary>
/// <param name="fileName">Path handed to <c>Utilities.GetReaderForFile</c>.</param>
/// <returns>
/// The loaded registry, or a new empty registry when no reader could be obtained.
/// </returns>
/// <exception cref="FileLoadException">
/// The header's identifier or version does not match <c>FileIdent</c> / <c>FileVersion</c>.
/// </exception>
public static CrawlerRegistry LoadFromFile(string fileName)
{
    var br = Utilities.GetReaderForFile(fileName);
    var registry = new CrawlerRegistry();
    if (br == null)
    {
        return registry;
    }

    try
    {
        // Header
        var ident = br.ReadString();
        var ver = br.ReadInt32();
        if (!ident.Equals(FileIdent) || ver != FileVersion)
        {
            throw new FileLoadException("Incorrect file format!");
        }

        // Entries; the largest key seen becomes the registry's LastId.
        var maxId = 0;
        var entryCount = br.ReadInt32();
        for (var i = 0; i < entryCount; i++)
        {
            var key = br.ReadInt32();
            var link = new CrawlerLink(br);
            registry.Links.Add(key, link);
            registry.AddressLookup[link.Address] = key;
            if (key > maxId)
            {
                maxId = key;
            }
        }

        registry.LastId = maxId;
        registry._isDirty = true;
        registry._hasPageRank = false;
    }
    finally
    {
        // Close the reader on every path, including a rejected header or a
        // truncated file, so the file handle is not leaked.
        br.Close();
    }

    return registry;
}
/// <summary>
/// Recursively evaluates this query node against <paramref name="registry"/>.
/// </summary>
/// <remarks>
/// Word nodes return the index entries for the word. And nodes intersect both
/// sides, except that a Not node on the right is subtracted locally. Or nodes
/// union both sides. Not nodes add their word's entries to
/// <paramref name="removals"/> and return an empty set.
/// NOTE(review): the set returned by <c>GetIndexEntries</c> is modified in place by
/// the And/Or branches - confirm that it is a copy and not registry-owned state.
/// </remarks>
/// <param name="registry">Registry whose index is queried.</param>
/// <param name="removals">Accumulator for entries that the caller must exclude.</param>
/// <returns>The entries matched by this node.</returns>
protected HashSet<CrawlerRegistry.IndexEntry> Execute(CrawlerRegistry registry, HashSet<CrawlerRegistry.IndexEntry> removals)
{
    HashSet<CrawlerRegistry.IndexEntry> matches;
    var kind = this.QueryType;

    if (kind == QueryType.Word)
    {
        matches = registry.GetIndexEntries(this.Word);
    }
    else if (kind == QueryType.And)
    {
        matches = this.LeftPart.Execute(registry, removals);

        if (this.RightPart.QueryType == QueryType.Not)
        {
            // "a AND NOT b": collect b's entries separately and subtract them here
            // rather than pushing them into the shared removals set.
            var localRemovals = new HashSet<CrawlerRegistry.IndexEntry>();
            this.RightPart.Execute(registry, localRemovals);
            matches.ExceptWith(localRemovals);
        }
        else
        {
            var rightMatches = this.RightPart.Execute(registry, removals);
            matches.IntersectWith(rightMatches);
        }
    }
    else if (kind == QueryType.Or)
    {
        matches = this.LeftPart.Execute(registry, removals);
        var rightMatches = this.RightPart.Execute(registry, removals);
        matches.UnionWith(rightMatches);
    }
    else
    {
        // Not nodes (and any other query type) contribute no matches of their own.
        matches = new HashSet<CrawlerRegistry.IndexEntry>();

        if (kind == QueryType.Not)
        {
            removals.UnionWith(registry.GetIndexEntries(this.Word));
        }
    }

    Console.WriteLine("[SubLevel] {0} entries, {1} removals.", matches.Count, removals.Count);
    return matches;
}
/// <summary>
/// Computes a normalized weight for each query token.
/// </summary>
/// <remarks>
/// A token found in <c>registry.Index</c> gets the raw weight
/// <c>log10(registry.Index.Count / df)</c>, where <c>df</c> is the number of entries
/// stored for that token. The raw weights are then divided by their Euclidean norm.
/// NOTE(review): <c>registry.Index.Count</c> is used as the collection size in the
/// idf term - confirm this is the intended count (documents vs. distinct terms).
/// </remarks>
/// <param name="tokens">Query tokens; repeated tokens are counted once.</param>
/// <param name="registry">Registry whose index supplies the entry counts.</param>
/// <returns>
/// Token to weight map. Tokens missing from the index map to <c>-1</c>. When every
/// raw weight is zero, the normalized weights are <c>0</c> rather than NaN.
/// </returns>
protected Dictionary<string, double> CalculateQueryWeights(IEnumerable<string> tokens, CrawlerRegistry registry)
{
    var queryWeights = new Dictionary<string, double>();
    var rawWeights = new Dictionary<string, double>();
    double sumOfSquares = 0;

    foreach (var token in tokens)
    {
        if (!registry.Index.ContainsKey(token))
        {
            // Sentinel for tokens the index does not know.
            queryWeights[token] = -1;
            continue;
        }

        // A repeated token must not be added to the norm a second time.
        if (rawWeights.ContainsKey(token))
        {
            continue;
        }

        var df = registry.Index[token].Count;
        var weight = Math.Log10(registry.Index.Count / (double)df);
        sumOfSquares += weight * weight;
        rawWeights[token] = weight;
    }

    var norm = Math.Sqrt(sumOfSquares);
    foreach (var raw in rawWeights)
    {
        // A zero norm means every raw weight is zero; avoid 0 / 0 = NaN, which
        // would otherwise propagate into every document score.
        queryWeights[raw.Key] = norm > 0 ? raw.Value / norm : 0;
    }

    return queryWeights;
}