Пример #1
0
            /// <summary>
            /// Runs this query against <paramref name="registry"/> and returns the matching
            /// entries with every collected removal taken out.
            /// </summary>
            /// <param name="registry">Registry passed through to the two-argument overload.</param>
            /// <returns>
            /// The set returned by the two-argument overload, minus the removals that
            /// overload accumulated while it ran.
            /// </returns>
            public HashSet <CrawlerRegistry.IndexEntry> Execute(CrawlerRegistry registry)
            {
                // The two-argument overload fills this set as a side effect.
                var excluded = new HashSet <CrawlerRegistry.IndexEntry>();
                var matches  = this.Execute(registry, excluded);

                // Diagnostic output: counts are reported before the removals are applied.
                Console.WriteLine("[TopLevel] {0} entries, {1} removals.", matches.Count, excluded.Count);

                matches.ExceptWith(excluded);
                return(matches);
            }
Пример #2
0
        /// <summary>
        /// Tokenizes and stems a free-text query, then runs the token-based search
        /// overload on the distinct stemmed terms.
        /// </summary>
        /// <param name="query">Raw query text; terms may be separated by any whitespace.</param>
        /// <param name="registry">Registry forwarded to the token-based overload.</param>
        /// <param name="maxResults">Forwarded unchanged to the token-based overload.</param>
        /// <param name="usePageRank">Forwarded unchanged to the token-based overload.</param>
        /// <returns>The result list produced by the token-based overload.</returns>
        public LinkedList <IndexEntry> Execute(string query, CrawlerRegistry registry, int maxResults = 25, bool usePageRank = false)
        {
            // A null separator array makes Split break on every whitespace character, so
            // tab- or newline-separated terms are no longer fused into a single token.
            var words = query.ToLower().Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            var stemmer       = new PorterStemmer();
            var stemmedTokens = new HashSet <string>();

            foreach (var word in words)
            {
                // Drop one-character tokens and stop words before stemming.
                if (word.Length <= 1 || StopWords.StopWordsList.Contains(word))
                {
                    continue;
                }

                // The word is already lower-cased; the set removes duplicate stems.
                stemmedTokens.Add(stemmer.StemWord(word));
            }

            return(this.Execute(stemmedTokens.ToList(), registry, maxResults, usePageRank));
        }
Пример #3
0
        /// <summary>
        /// Scores every document that contains at least one of <paramref name="tokens"/>
        /// and returns the best-scoring ones as a list of <c>IndexEntry</c> values.
        /// </summary>
        /// <remarks>
        /// Each index entry contributes a weight of <c>1 + log10(Frequency)</c>. A document's
        /// weights are divided by the square root of the sum of their squares, multiplied by
        /// the per-token weights from <c>CalculateQueryWeights</c>, and summed into its score.
        /// </remarks>
        /// <param name="tokens">Query terms to look up in the registry.</param>
        /// <param name="registry">Source of index entries and, optionally, page ranks.</param>
        /// <param name="maxResults">Maximum number of results to return.</param>
        /// <param name="usePageRank">
        /// When true, the top <c>maxResults * 4</c> documents by score are re-sorted by
        /// <c>registry.PageRanks</c> (descending) before the first <paramref name="maxResults"/> are kept.
        /// </param>
        /// <returns>Results in ranked order, each built from a link id and its score.</returns>
        protected LinkedList <IndexEntry> Execute(List <string> tokens, CrawlerRegistry registry, int maxResults, bool usePageRank)
        {
            var queryWeights = this.CalculateQueryWeights(tokens, registry);

            // Per document: weight of each matched token, and the running sum of squared weights.
            var weightsByDocument    = new Dictionary <int, Dictionary <string, double> >();
            var squaredSumByDocument = new Dictionary <int, double>();

            foreach (var token in tokens)
            {
                foreach (var posting in registry.GetIndexEntries(token))
                {
                    var weight = 1 + Math.Log10(posting.Frequency);
                    var linkId = posting.LinkId;

                    Dictionary <string, double> termWeights;
                    if (!weightsByDocument.TryGetValue(linkId, out termWeights))
                    {
                        termWeights                  = new Dictionary <string, double>();
                        weightsByDocument[linkId]    = termWeights;
                        squaredSumByDocument[linkId] = 0;
                    }

                    termWeights[token]            = weight;
                    squaredSumByDocument[linkId] += weight * weight;
                }
            }

            // Normalise each document vector and combine it with the query weights.
            var scores = new Dictionary <int, double>();
            foreach (var document in weightsByDocument)
            {
                var norm = Math.Sqrt(squaredSumByDocument[document.Key]);
                scores[document.Key] = document.Value.Sum(term => (term.Value / norm) * queryWeights[term.Key]);
            }

            IEnumerable <KeyValuePair <int, double> > ranked;
            if (usePageRank)
            {
                // Take a wider candidate pool by score, then let page rank decide the final order.
                ranked = scores.OrderByDescending(candidate => candidate.Value)
                         .Take(maxResults * 4)
                         .OrderByDescending(candidate => registry.PageRanks[candidate.Key])
                         .Take(maxResults);
            }
            else
            {
                ranked = scores.OrderByDescending(candidate => candidate.Value)
                         .Take(maxResults);
            }

            var results = new LinkedList <IndexEntry>();
            foreach (var hit in ranked)
            {
                results.AddLast(new IndexEntry(hit.Key, hit.Value));
            }

            return(results);
        }
Пример #4
0
        /// <summary>
        /// Writes the links of <paramref name="registry"/> to <paramref name="fileName"/>.
        /// </summary>
        /// <remarks>
        /// Layout: <c>FileIdent</c>, <c>FileVersion</c>, the number of links, then for each
        /// link its key followed by whatever the link's own <c>SaveTo</c> writes.
        /// </remarks>
        /// <param name="fileName">Path handed to <c>Utilities.GetWriterForFile</c>.</param>
        /// <param name="registry">Registry whose <c>Links</c> are written.</param>
        public static void SaveToFile(string fileName, CrawlerRegistry registry)
        {
            var bw = Utilities.GetWriterForFile(fileName);

            try
            {
                // Header
                bw.Write(FileIdent);
                bw.Write(FileVersion);

                // Entries
                bw.Write(registry.Links.Count);
                foreach (var registryLink in registry.Links)
                {
                    bw.Write(registryLink.Key);
                    registryLink.Value.SaveTo(bw);
                }
            }
            finally
            {
                // Close the writer even when a write fails, so the file handle is not leaked.
                bw.Close();
            }
        }
Пример #5
0
        /// <summary>
        /// Reads a registry previously written in the format that starts with
        /// <c>FileIdent</c> and <c>FileVersion</c>.
        /// </summary>
        /// <param name="fileName">Path handed to <c>Utilities.GetReaderForFile</c>.</param>
        /// <returns>
        /// The loaded registry, or an empty registry when no reader could be obtained
        /// (the helper returned null).
        /// </returns>
        /// <exception cref="FileLoadException">
        /// The identifier or version stored in the file does not match
        /// <c>FileIdent</c> / <c>FileVersion</c>.
        /// </exception>
        public static CrawlerRegistry LoadFromFile(string fileName)
        {
            var br       = Utilities.GetReaderForFile(fileName);
            var registry = new CrawlerRegistry();

            if (br == null)
            {
                return(registry);
            }

            try
            {
                // Header
                var ident = br.ReadString();
                var ver   = br.ReadInt32();

                if (!ident.Equals(FileIdent) || ver != FileVersion)
                {
                    throw new FileLoadException("Incorrect file format!");
                }

                var maxId      = 0;
                var entryCount = br.ReadInt32();

                for (var i = 0; i < entryCount; i++)
                {
                    var key  = br.ReadInt32();
                    var link = new CrawlerLink(br);

                    registry.Links.Add(key, link);
                    registry.AddressLookup[link.Address] = key;

                    // Track the largest key seen so LastId can be restored below.
                    if (key > maxId)
                    {
                        maxId = key;
                    }
                }

                registry.LastId = maxId;
                // NOTE(review): presumably these flags force derived data (index, page rank)
                // to be rebuilt after a load - confirm against where they are read.
                registry._isDirty     = true;
                registry._hasPageRank = false;
            }
            finally
            {
                // Close the reader on every path, including a format mismatch or a failed read.
                br.Close();
            }

            return(registry);
        }
Пример #6
0
            /// <summary>
            /// Evaluates this query node against <paramref name="registry"/>.
            /// </summary>
            /// <remarks>
            /// <c>Word</c> returns the registry's entries for the word. <c>And</c> intersects the
            /// left result with the right one, unless the right operand is a <c>Not</c> node, in
            /// which case the right operand's removals are subtracted from the left result
            /// immediately. <c>Or</c> unions both sides. <c>Not</c> adds the word's entries to
            /// <paramref name="removals"/> and returns an empty set. Any other query type also
            /// returns an empty set.
            /// </remarks>
            /// <param name="registry">Registry queried for index entries.</param>
            /// <param name="removals">Set that <c>Not</c> nodes add their entries to.</param>
            /// <returns>The entries matched by this node.</returns>
            protected HashSet <CrawlerRegistry.IndexEntry> Execute(CrawlerRegistry registry, HashSet <CrawlerRegistry.IndexEntry> removals)
            {
                HashSet <CrawlerRegistry.IndexEntry> matches;
                var kind = this.QueryType;

                if (kind == QueryType.Word)
                {
                    // NOTE(review): if GetIndexEntries returns the registry's own set rather than
                    // a copy, the in-place set operations in callers would modify it - confirm.
                    matches = registry.GetIndexEntries(this.Word);
                }
                else if (kind == QueryType.And)
                {
                    matches = this.LeftPart.Execute(registry, removals);

                    if (this.RightPart.QueryType == QueryType.Not)
                    {
                        // "a AND NOT b": collect b separately and subtract it right away, so it
                        // does not end up in the removals shared with the rest of the query.
                        var negated = new HashSet <CrawlerRegistry.IndexEntry>();

                        this.RightPart.Execute(registry, negated);
                        matches.ExceptWith(negated);
                    }
                    else
                    {
                        matches.IntersectWith(this.RightPart.Execute(registry, removals));
                    }
                }
                else if (kind == QueryType.Or)
                {
                    matches = this.LeftPart.Execute(registry, removals);
                    matches.UnionWith(this.RightPart.Execute(registry, removals));
                }
                else
                {
                    matches = new HashSet <CrawlerRegistry.IndexEntry>();

                    if (kind == QueryType.Not)
                    {
                        removals.UnionWith(registry.GetIndexEntries(this.Word));
                    }
                }

                Console.WriteLine("[SubLevel] {0} entries, {1} removals.", matches.Count, removals.Count);

                return(matches);
            }
Пример #7
0
        /// <summary>
        /// Computes a normalised weight for every query token.
        /// </summary>
        /// <remarks>
        /// A token missing from <c>registry.Index</c> gets the weight -1. Every other token
        /// gets <c>log10(registry.Index.Count / df)</c>, where <c>df</c> is the number of
        /// entries stored for it, divided by the Euclidean norm of those raw weights.
        /// When the norm is zero (all raw weights are zero) the weights are 0 rather than NaN.
        /// </remarks>
        /// <param name="tokens">Query tokens; repeated tokens are counted once.</param>
        /// <param name="registry">Registry whose <c>Index</c> supplies the entry counts.</param>
        /// <returns>A weight for each distinct token in <paramref name="tokens"/>.</returns>
        protected Dictionary <string, double> CalculateQueryWeights(IEnumerable <string> tokens, CrawlerRegistry registry)
        {
            double sumOfSquares = 0;
            var    queryWeights = new Dictionary <string, double>();
            var    rawWeights   = new Dictionary <string, double>();

            foreach (var token in tokens)
            {
                if (!registry.Index.ContainsKey(token))
                {
                    queryWeights[token] = -1;
                    continue;
                }

                // A repeated token is stored only once, so it must also be counted only
                // once in the norm; otherwise the returned vector is not unit length.
                if (rawWeights.ContainsKey(token))
                {
                    continue;
                }

                // NOTE(review): registry.Index.Count is the number of keys in the index and is
                // used here as the numerator of the ratio - confirm this is the intended total
                // (as opposed to a document count).
                var df     = registry.Index[token].Count;
                var weight = Math.Log10(registry.Index.Count / (double)df);

                sumOfSquares     += weight * weight;
                rawWeights[token] = weight;
            }

            var norm = Math.Sqrt(sumOfSquares);

            foreach (var raw in rawWeights)
            {
                // Guard the division: a zero norm would turn every weight into NaN (0 / 0).
                queryWeights[raw.Key] = norm > 0 ? raw.Value / norm : 0;
            }

            return(queryWeights);
        }