Example #1
        /// <summary>
        /// Weights all categories and subcategories by the number of search results,
        /// sorts the XML by that weight, and saves search results with content snippets
        /// into the subcategories that actually appear on the index pages.
        /// </summary>
        /// <param name="path">The path to the directory where the XML file is in.</param>
        /// <param name="proxy">The proxy.</param>
        public static void CreateWeightsAndSaveSearchResults(string path, RCLocalProxy proxy)
        {
            proxy.Logger.Info("Ontology: Weighting ontology. This can take several minutes or hours.");
            string xmlFileName = path + IndexServer.CLUSTERS_XML_FILE_NAME;
            XmlDocument xmlDoc = IndexServer.GetClustersXMLDocument(xmlFileName);
            lock (xmlDoc)
            {
                XmlElement rootXml = xmlDoc.DocumentElement;

                if (rootXml == null)
                {
                    proxy.Logger.Warn("Ontology: No proper clusters.xml with ontology. Aborting weighting.");
                    return;
                }

                proxy.Logger.Debug("Ontology: Step 1/3: Getting number of cached items.");
                int limit = proxy.ProxyCacheManager.CachedItems();
                if (limit == 0)
                {
                    limit = 1;
                }
                int i = 1;
                foreach (XmlElement categoryElement in rootXml.ChildNodes)
                {
                    proxy.Logger.Debug(String.Format("Ontology: Step 2/3: Calculating weights for category ({0}/{1}): {2}",
                      i, rootXml.ChildNodes.Count, categoryElement.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR)));
                    // Determine the weight for the category and all subcategories
                    DetermineWeight(categoryElement, proxy, limit);
                    foreach (XmlElement subcategoryElement in categoryElement.ChildNodes)
                    {
                        DetermineWeight(subcategoryElement, proxy, limit);
                    }
                    i++;
                }
                SortByWeight(rootXml);

                // Getting search results for all subcategories visible on the index page.
                for (int catNo = 0; catNo < Math.Min(IndexServer.NUMBER_OF_CATEGORIES, rootXml.ChildNodes.Count); catNo++)
                {
                    proxy.Logger.Debug(String.Format("Ontology: Step 3/3: Getting number of cached items for category ({0}/{1})",
                        catNo + 1, IndexServer.NUMBER_OF_CATEGORIES));
                    for (int subcatNo = 0; subcatNo < Math.Min(IndexServer.NUMBER_OF_SUBCATEGORIES, rootXml.ChildNodes[catNo].ChildNodes.Count); subcatNo++)
                    {
                        AppendSearchResults(rootXml.ChildNodes[catNo].ChildNodes[subcatNo] as XmlElement, proxy, IndexServer.NUMBER_OF_LINKS);
                    }
                }

                // Set timestamp for the new clusters.xml
                rootXml.SetAttribute("time", "" + DateTime.Now.ToFileTime());

                // Save new xml
                xmlDoc.Save(xmlFileName);
            }

            proxy.Logger.Info("Ontology: Finished successfully.");
        }
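A minimal call sketch for the method above. The containing class name (Ontology, guessed from the log prefix) and the helper method below are assumptions; only the CreateWeightsAndSaveSearchResults(string path, RCLocalProxy proxy) signature comes from the example.

        // Hypothetical call site (sketch): re-weight the ontology after the cache changes.
        // "Ontology" as the containing class is a guess based on the log prefix; only the
        // CreateWeightsAndSaveSearchResults(string, RCLocalProxy) signature is from above.
        public static void RefreshOntologyWeights(RCLocalProxy proxy, string clustersDirectory)
        {
            // clustersDirectory must contain the clusters.xml file
            // (IndexServer.CLUSTERS_XML_FILE_NAME).
            Ontology.CreateWeightsAndSaveSearchResults(clustersDirectory, proxy);
        }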
Example #2
 /// <summary>
 /// Appends lucene search results to a subcategory element.
 /// </summary>
 /// <param name="subCategoryElement">The element.</param>
 /// <param name="proxy">The proxy.</param>
 /// <param name="numberOfResults">The maximum nunber of results to add.</param>
 private static void AppendSearchResults(XmlElement subCategoryElement, RCLocalProxy proxy, int numberOfResults)
 {
     string title = subCategoryElement.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR);
     // Do a Lucene search
     SearchResults luceneResults = proxy.IndexWrapper.Query(
         title, proxy.CachePath, 0, numberOfResults, true, -1);
     // Remove current children
     subCategoryElement.RemoveAllChilds();
     // Add the results to the XML
     LocalInternalRequestHandler.AppendSearchResultsXMLElements(luceneResults, subCategoryElement.OwnerDocument, subCategoryElement);
 }
Example #3
        /// <summary>
        /// Creates the clusters.
        /// </summary>
        /// <param name="k">The number of clusters to create.</param>
        /// <param name="catNFeatures">The maximum number of features for a category.</param>
        /// <param name="subcatNFeatures">The maximum number of features for a subcategory.</param>
        /// <param name="hierarchical">If the clusters should be organized hierarchical.</param>
        /// <param name="maxCategories">The maximum number of categories.</param>
        /// <param name="clustersPath">The path to the clusters folder.</param>
        /// <param name="proxy">The proxy.</param>
        public static void CreateClusters(int k, int catNFeatures, int subcatNFeatures, bool hierarchical,
            int maxCategories, string clustersPath, RCLocalProxy proxy)
        {
            proxy.Logger.Info("Clustering: Creating clusters. This may take around an hour!");
            // Measure what part takes what time
            Stopwatch stopwatch = new Stopwatch();

            // Filenames
            string docFileName = clustersPath + DOC_FILE_NAME;
            string matFileName = clustersPath + MAT_FILE_NAME;
            string clustersFileName = clustersPath + CLUSTERS_FILE_NAME;
            string xmlBTFileName = clustersPath + CLUSTERS_BT_XML_FILE_NAME;
            string xmlFileName = clustersPath + IndexServer.CLUSTERS_XML_FILE_NAME;

            // get files
            proxy.Logger.Debug("Clustering (1/6): Getting all text files.");
            stopwatch.Start();
            List<string> textFiles = proxy.ProxyCacheManager.TextFiles();
            stopwatch.Stop();
            proxy.Logger.Debug("Custering (1/6): Getting all text files took: " + stopwatch.Elapsed.TotalSeconds + "s");

            // Abort if we have fewer than 2 text files
            if (textFiles.Count < 2)
            {
                proxy.Logger.Debug("Clustering: Less than 2 text files, aborting.");
                return;
            }
            // List number of text files
            proxy.Logger.Debug(String.Format("Clustering (1/6): Using {0} text files.", textFiles.Count));

            List<string> titles;
            // files2doc
            proxy.Logger.Debug("Clustering (2/6): Creating docfile.");
            stopwatch.Restart();
            try
            {
                titles = Cluster.CreateDocFile(textFiles, docFileName);
            }
            catch (IOException e)
            {
                proxy.Logger.Warn("Clustering: DocFile creation failed.", e);
                return;
            }
            stopwatch.Stop();
            proxy.Logger.Debug("Custering (2/6): Creating docfile took: " + stopwatch.Elapsed.TotalSeconds + "s");

            // doc2mat
            proxy.Logger.Debug("Clustering (3/6): Doc2Mat.");
            stopwatch.Restart();
            try
            {
                Doc2Mat.DoDoc2Mat(docFileName, matFileName);
            }
            catch (Exception e)
            {
                proxy.Logger.Warn("Clustering: Doc2Mat failed.", e);
                return;
            }
            stopwatch.Stop();
            proxy.Logger.Debug("Custering (3/6): Doc2Mat took: " + stopwatch.Elapsed.TotalSeconds + "s");

            // ClutoClusters
            proxy.Logger.Debug("Clustering (4/6): Cluto-Clustering.");
            string treeFileName = null;
            HashSet<string>[] features;
            stopwatch.Restart();
            try
            {
                if (hierarchical)
                {
                    treeFileName = clustersPath + TREE_FILE_NAME;
                    features = Cluster.CreateClusters(matFileName, clustersFileName, k, true, treeFileName,
                        catNFeatures, subcatNFeatures);
                }
                else
                {
                    features = Cluster.CreateClusters(matFileName, clustersFileName, k, false, "",
                        catNFeatures, subcatNFeatures);
                }
            }
            catch (Exception e)
            {
                proxy.Logger.Warn("Clustering: Cluto failed.", e);
                return;
            }
            stopwatch.Stop();
            proxy.Logger.Debug("Custering (4/6): Cluto-Clustering took: " + stopwatch.Elapsed.TotalSeconds + "s");

            // Create binary tree XML file
            proxy.Logger.Debug("Clustering (5/6): Creating clustersBT.xml.");
            stopwatch.Restart();
            try
            {
                Cluster.CreateClusterBTXMLFile(textFiles, features, clustersFileName, (hierarchical ? treeFileName : ""),
                    xmlBTFileName, k, proxy.CachePath.Length, titles);
            }
            catch (Exception e)
            {
                proxy.Logger.Warn("Clustering: Creating XML failed.", e);
                return;
            }
            stopwatch.Stop();
            proxy.Logger.Debug("Clustering (5/6): Creating clustersBT.xml took " + stopwatch.Elapsed.TotalSeconds + " s");

            // Create XML file
            proxy.Logger.Debug("Clustering (6/6): Creating clusters.xml.");
            stopwatch.Restart();
            try
            {
                Cluster.CreateClusterXMLFile(xmlFileName, xmlBTFileName, maxCategories);
            }
            catch (Exception e)
            {
                proxy.Logger.Error("Clustering: Creating clusters.xml failed.", e);
                return;
            }
            stopwatch.Stop();
            proxy.Logger.Debug("Custering (6/6): Creating clusters.xml took: " + stopwatch.Elapsed.TotalSeconds + "s");

            proxy.Logger.Info("Clustering: Finished successfully.");
        }
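A hedged call sketch for the clustering entry point. The containing class is not shown on this page (the nested calls go through a separate Cluster helper), so the Clusterer qualifier and every numeric value below are assumptions; only the CreateClusters(...) signature is taken from the example.

        // Hypothetical call site (sketch): build a hierarchical clustering of the cache.
        // "Clusterer" as the containing class and all parameter values are assumptions;
        // only the CreateClusters(...) signature is taken from the example above.
        public static void RebuildClusters(RCLocalProxy proxy, string clustersPath)
        {
            Clusterer.CreateClusters(
                10,            // k: number of clusters
                5,             // catNFeatures: max features per category
                5,             // subcatNFeatures: max features per subcategory
                true,          // hierarchical: organize categories into a hierarchy
                10,            // maxCategories: cap on top-level categories
                clustersPath,  // folder that receives the cluster files
                proxy);
        }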
Example #4
 /// <summary>
 /// Determines the weight for a (sub)category.
 /// </summary>
 /// <param name="element">The XML element</param>
 /// <param name="limit">The upper limit for number of search results, which is used as weight.</param>
 /// <param name="proxy">The proxy.</param>
 private static void DetermineWeight(XmlElement element, RCLocalProxy proxy, int limit)
 {
     string title = element.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR);
     int weight = proxy.IndexWrapper.NumberOfResults(title, limit);
     // Set weight
     element.SetAttribute(IndexServer.INDEX_WEIGHT_XML_ATTR, "" + weight);
 }
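The weight written here is what SortByWeight in Example #1 sorts on. That method is not shown on this page; the following is a minimal sketch of such a sort, assuming the weight attribute always parses as an integer and all children are XmlElement nodes. The real implementation may differ (for instance, it presumably also sorts the subcategories within each category).

        // Hypothetical sketch of a SortByWeight-style helper (the real implementation is
        // not shown on this page): reorder child elements by descending weight.
        // Requires System.Linq, System.Xml and System.Collections.Generic.
        private static void SortChildrenByWeight(XmlElement parent)
        {
            List<XmlElement> children = parent.ChildNodes.Cast<XmlElement>()
                .OrderByDescending(child =>
                {
                    int weight;
                    string attr = child.GetAttribute(IndexServer.INDEX_WEIGHT_XML_ATTR);
                    return Int32.TryParse(attr, out weight) ? weight : 0;
                })
                .ToList();
            foreach (XmlElement child in children)
            {
                // Re-appending an existing child moves it to the end, so after the loop
                // the children appear in descending weight order.
                parent.AppendChild(child);
            }
        }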
Example #5
        /// <summary>
        /// Computes the 3rd level in the hierarchy for a given category and subcategory.
        /// </summary>
        /// <param name="clusterXMLFile">The path to clusters.xml</param>
        /// <param name="categoryId">The category id.</param>
        /// <param name="subCategoryId">The subcategory id.</param>
        /// <param name="proxy">Proxy access to conduct a Lucene search.</param>
        /// <returns>The index.xml string.</returns>
        public static string Level3Index(string clusterXMLFile, string categoryId, string subCategoryId, RCLocalProxy proxy)
        {
            XmlDocument clustersDoc = GetClustersXMLDocument(clusterXMLFile);

            XmlDocument indexDoc = new XmlDocument();
            indexDoc.AppendChild(indexDoc.CreateXmlDeclaration("1.0", "UTF-8", String.Empty));

            XmlElement indexRootXml = indexDoc.CreateElement(INDEX_CATEGORIES_XML_NAME);
            indexDoc.AppendChild(indexRootXml);
            indexRootXml.SetAttribute(INDEX_LEVEL_XML_ATTR, String.Empty + 3);

            // Find category and subcategory element
            XmlElement categoryElement, subCategoryElement;
            categoryElement = FindCategory(clustersDoc.DocumentElement, categoryId);
            if (categoryElement == null)
            {
                throw new ArgumentException("Could not find category with that id.");
            }
            subCategoryElement = FindCategory(categoryElement, subCategoryId);
            if (subCategoryElement == null)
            {
                throw new ArgumentException("Could not find subcategory with that id.");
            }

            // Import category
            XmlNode category = indexRootXml.AppendChild(indexDoc.ImportNode(categoryElement, false));
            // Import subcategory
            XmlNode subCategory = category.AppendChild(indexDoc.ImportNode(subCategoryElement, false));
            if (subCategoryElement.ChildNodes.Count == 0)
            {
                // Do a Lucene search, if there are no items.
                SearchResults luceneResults = proxy.IndexWrapper.Query(
                    subCategoryElement.GetAttribute(INDEX_FEATURES_XML_ATTR),
                    proxy.CachePath, 0, NUMBER_OF_LINKS, true, -1);

                // Add the results to the XML
                LocalInternalRequestHandler.AppendSearchResultsXMLElements(luceneResults, indexDoc, subCategory as XmlElement);
            }
            else
            {
                // Import up to maxItems items
                for (int i = 0; i < subCategoryElement.ChildNodes.Count && (NUMBER_OF_LINKS == 0 || i < NUMBER_OF_LINKS); i++)
                {
                    subCategory.AppendChild(indexDoc.ImportNode(subCategoryElement.ChildNodes[i], true));
                }
            }

            return indexDoc.InnerXml;
        }
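A hedged call sketch for the third-level index. The IndexServer qualifier is a guess (the unqualified GetClustersXMLDocument call and the NUMBER_OF_LINKS constant suggest the method lives there), and the id values are made up; only the Level3Index(...) signature and its string return value come from the example.

        // Hypothetical call site (sketch): build the level-3 index page for one subcategory.
        // The IndexServer qualifier and the id values "3" / "3.2" are assumptions;
        // only the Level3Index(...) signature is taken from the example above.
        public static string BuildLevel3Page(RCLocalProxy proxy, string clustersXmlFile)
        {
            // Throws ArgumentException if the category or subcategory id is unknown.
            return IndexServer.Level3Index(clustersXmlFile, "3", "3.2", proxy);
        }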
Example #6
        /// <summary>
        /// Removes all dead links from the index.
        /// </summary>
        /// <param name="proxy">The proxy, to log and to gain access to the cache manager.</param>
        public void RemoveAllDeadLinks(RCLocalProxy proxy)
        {
            proxy.Logger.Info(String.Format("The index contains {0} documents.",
                        IndexItemCount()));

            proxy.Logger.Info("Deleting all dead links from index...");
            Lucene.Net.Store.FSDirectory directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(_indexPath));
            IndexReader reader = IndexReader.Open(directory, true);
            IndexWriter writer = new IndexWriter(_indexPath, _analyzer, false);

            int deleted = 0;
            int cacheFailures = 0;

            for (int i = 0; i < reader.MaxDoc(); i++)
            {
                if (reader.IsDeleted(i))
                {
                    continue;
                }
                proxy.Logger.Debug(i + " files scanned out of: " + reader.MaxDoc());
                Document doc = reader.Document(i);

                string uri = doc.Get("uri");
                string relFileName = CacheManager.GetRelativeCacheFileName(uri, "GET");
                string absFileName = proxy.ProxyCacheManager.CachePath + relFileName;

                if (!proxy.ProxyCacheManager.IsCached(relFileName))
                {
                    if (File.Exists(absFileName))
                    {
                        cacheFailures++;
                        proxy.Logger.Warn(String.Format(
                            "Cache Failure {0}. Not cached but file exists: {1}",
                        cacheFailures, relFileName));

                        NameValueCollection headers = new NameValueCollection()
                        {
                            // We need to include content-type, as we always want that header!
                            { "Content-Type", "text/html"}
                        };

                        // The index entry already exists, so we don't want to create a new one here.
                        GlobalCacheItemToAdd newItem = new GlobalCacheItemToAdd(relFileName, headers, 200, false);

                        // Add file to the database
                        proxy.ProxyCacheManager.AddCacheItemsForExistingFiles(new HashSet<GlobalCacheItemToAdd>() { newItem });
                        continue;
                    }

                    deleted++;
                    proxy.Logger.Info(String.Format("This is doc number {0} we delete. Deleting {1} from the Lucene index.",
                        deleted, relFileName));
                    writer.DeleteDocuments(new Term("uri", uri));
                    if (deleted % 100 == 0)
                    {
                        writer.Commit();
                    }
                }
            }

            writer.Commit();
            reader.Close();
            writer.Close();
            proxy.Logger.Info(String.Format(
                "Deleted all dead links from index. Deleted {0} index items and added {1} DB items.",
                deleted, cacheFailures));
        }
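A hedged call sketch. That this instance method is reachable via the proxy's index wrapper is an assumption based on the proxy.IndexWrapper calls in the other examples; only the RemoveAllDeadLinks(RCLocalProxy) signature is shown above.

        // Hypothetical call site (sketch): purge index entries whose cached files are gone.
        // Reaching the method via proxy.IndexWrapper is an assumption; only the
        // RemoveAllDeadLinks(RCLocalProxy) signature is taken from the example above.
        public static void PurgeDeadLinks(RCLocalProxy proxy)
        {
            proxy.IndexWrapper.RemoveAllDeadLinks(proxy);
        }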
Example #7
 /// <summary>
 /// Instantiate a new CrawlerWrapper.
 /// </summary>
 /// <param name="proxy">The local proxy.</param>
 public CrawlerWrapper(RCLocalProxy proxy)
 {
     this._proxy = proxy;
 }
        /// <summary>
        /// Tests ALL link suggestions in LS_DEBUG mode. Should only be used for small caches.
        /// </summary>
        /// <param name="proxy">The local proxy.</param>
        public static void DebugLinkSuggestion(RCLocalProxy proxy)
        {
            CacheManager cm = proxy.ProxyCacheManager;
            string[] stopwords = File.ReadAllLines("stopwords.txt");

            List<string> sites = cm.TextFiles();
            int i = 1;
            foreach (string site in sites)
            {
                proxy.Logger.Debug(String.Format("Working site {0} of {1}", i++, sites.Count));
                string url = cm.AbsoluteFilePathToUri(site);

                string content = Utils.ReadFileAsString(site);
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(content);

                HtmlNode head = doc.DocumentNode.SelectSingleNode("/html/head");
                HtmlNode body = doc.DocumentNode.SelectSingleNode("/html/body");
                if (head == null || body == null)
                {
                    // The HTML isn't sane, just continue
                    continue;
                }

                // Gets all links containing text.
                HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//a[not(node()[2])][text()]/@href");
                if (links != null)
                {
                    int ign;
                    foreach (HtmlNode link in links)
                    {
                        string relTarget = link.GetAttributeValue("href", "");
                        string target;
                        try
                        {
                            target = new Uri(new Uri(url), relTarget).ToString();
                        }
                        catch (UriFormatException)
                        {
                            continue;
                        }

                        if (LINK_ANCHOR_BLACKLIST.Contains(link.InnerText.ToLower())
                            || Int32.TryParse(link.InnerText, out ign) || url.ToLower().Equals(target.ToLower()))
                        {
                            // No "Here", ... links or number links
                            // No links to the same page! (LS_DEBUG)
                            continue;
                        }
                        string relFileName = CacheManager.GetRelativeCacheFileName(target, "GET");

                        if (!target.Equals("") && cm.IsCached(relFileName))
                        {
                            // Get anchor and surrounding text
                            string anchorText = link.InnerText;
                            string surroundingText = HtmlUtils.GetSurroundingText(link,
                                stopwords, false);

                            // Trigger LS
                            GetLinkSuggestions(target, url, anchorText, surroundingText,
                                3, proxy);
                        }
                    }
                }
            }
        }
        /// <summary>
        /// Gets the link suggestions for an uncached link.
        /// </summary>
        /// <param name="url">The absolute URL.</param>
        /// <param name="refUrl">The referer URL.</param>
        /// <param name="anchorText">The anchor text.</param>
        /// <param name="surroundingText">The surrounding text.</param>
        /// <param name="amount">The amount of suggestions to get.</param>
        /// <param name="proxy">The local proxy.</param>
        /// <returns>The suggestions.</returns>
        public static IEnumerable<SearchResult> GetLinkSuggestions(string url, string refUrl, string anchorText,
            string surroundingText, int amount, RCLocalProxy proxy)
        {
            // Remove all http:// or https:// from the query
            string url0 = url.Replace("http://", "").Replace("https://", "");
            string refUrl0 = refUrl.Replace("http://", "").Replace("https://", "");
            string anchorText0 = anchorText.Replace("http://", "").Replace("https://", "");
            string surroundingText0 = surroundingText.Replace("http://", "").Replace("https://", "");

            // If we're debugging, we want 51 results, otherwise
            // we want one result more, as we're very probably going to find the referrer page
            int amount0 = Properties.Network.Default.LS_DEBUG ? LS_DEBUG_NUM_RESULTS : amount + 1;

            SearchResults luceneResults = proxy.IndexWrapper.Query(new string[] { url0, refUrl0, anchorText0, surroundingText0 }, LINK_SUGGESTION_BOOSTS,
                proxy.CachePath, 0, amount0, true, -1);

            // remove the referrer page from the results
            for (int i = 0; i < luceneResults.Results.Count; i++)
            {
                if (luceneResults.Results[i].URI.ToLower().Equals(refUrl.ToLower()))
                {
                    luceneResults.RemoveDocument(i);
                    break;
                }
            }
            // In the rare case that the referrer page was not among the results, we have to remove the last result
            if (luceneResults.Results.Count > amount0)
            {
                luceneResults.RemoveDocument(luceneResults.Results.Count - 1);
            }

            if (Properties.Network.Default.LS_DEBUG)
            {
                bool sameDomain = new Uri(refUrl).Host.Equals(new Uri(url).Host);

                bool found = false;
                // If we're debugging we want to find out the position of the original url.
                for (int i = 0; i < luceneResults.Results.Count; i++)
                {
                    if (luceneResults.Results[i].URI.ToLower().Equals(url.ToLower()))
                    {
                        found = true;
                        // Warn level makes it easier to find.
                        proxy.Logger.Warn(String.Format("LS_DEBUG: {0}|{1}|{2}|{3}", i, sameDomain, refUrl0, url0));
                        break;
                    }
                }
                if (!found)
                {
                    proxy.Logger.Warn(String.Format("LS_DEBUG: -1|{0}|{1}|{2}", sameDomain, refUrl0, url0));
                }
                return luceneResults.Take(amount);
            }

            return luceneResults;
        }
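A hedged call sketch for requesting suggestions for a single uncached link. The containing class name (LinkSuggestion here) and the example URLs and texts are made up; only the GetLinkSuggestions(...) signature and its IEnumerable&lt;SearchResult&gt; return type come from the example.

        // Hypothetical call site (sketch): ask for 3 suggestions for an uncached link.
        // "LinkSuggestion" as the containing class and the URL/text literals are
        // assumptions; only the GetLinkSuggestions(...) signature is from the example above.
        public static IEnumerable<SearchResult> SuggestFor(RCLocalProxy proxy)
        {
            return LinkSuggestion.GetLinkSuggestions(
                "http://example.com/article",          // url: the uncached target
                "http://example.com/index.html",       // refUrl: the page containing the link
                "full article",                        // anchorText
                "read the full article on this topic", // surroundingText
                3,                                     // amount of suggestions wanted
                proxy);
        }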