/// <summary>
/// Weights all categories and subcategories by the number of search results,
/// sorts the XML by that and saves search results with content snippets in the subcategories,
/// that do actually appear in the index pages.
/// Pipeline: (1) count cached items, (2) weight every category/subcategory,
/// (3) sort by weight and materialize search results for the visible subcategories,
/// then timestamp and save clusters.xml back to disk.
/// </summary>
/// <param name="path">The path to the directory where the XML file is in.</param>
/// <param name="proxy">The proxy.</param>
public static void CreateWeightsAndSaveSearchResults(string path, RCLocalProxy proxy)
{
    proxy.Logger.Info("Ontology: Weighting ontology. This can take several minutes or hours.");
    string xmlFileName = path + IndexServer.CLUSTERS_XML_FILE_NAME;
    XmlDocument xmlDoc = IndexServer.GetClustersXMLDocument(xmlFileName);
    // NOTE(review): locks on the XmlDocument instance itself — assumes every other
    // reader/writer of this document synchronizes on the same object; confirm callers.
    lock (xmlDoc)
    {
        XmlElement rootXml = xmlDoc.DocumentElement;
        if (rootXml == null)
        {
            proxy.Logger.Warn("Ontology: No proper clusters.xml with ontology. Aborting weighting.");
            return;
        }
        proxy.Logger.Debug("Ontology: Step 1/3: Getting number of cached items.");
        int limit = proxy.ProxyCacheManager.CachedItems();
        if (limit == 0)
        {
            // Never pass 0 as the upper bound to the weight queries below.
            limit = 1;
        }
        int i = 1;
        // Step 2: weight each category and all of its direct subcategories.
        // NOTE(review): the foreach cast assumes all children are XmlElements;
        // a comment or text node here would raise InvalidCastException — verify
        // how clusters.xml is generated.
        foreach (XmlElement categoryElement in rootXml.ChildNodes)
        {
            proxy.Logger.Debug(String.Format("Ontology: Step 2/3: Calculating weights for category ({0}/{1}): {2}",
                i, rootXml.ChildNodes.Count, categoryElement.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR)));
            // Determine the weight for the category and all subcategories
            DetermineWeight(categoryElement, proxy, limit);
            foreach (XmlElement subcategoryElement in categoryElement.ChildNodes)
            {
                DetermineWeight(subcategoryElement, proxy, limit);
            }
            i++;
        }
        SortByWeight(rootXml);
        // Getting search results for all subcategories visible on the index page.
        // Only the first NUMBER_OF_CATEGORIES x NUMBER_OF_SUBCATEGORIES entries
        // (post-sort, i.e. the heaviest ones) get result snippets attached.
        for (int catNo = 0; catNo < Math.Min(IndexServer.NUMBER_OF_CATEGORIES, rootXml.ChildNodes.Count); catNo++)
        {
            proxy.Logger.Debug(String.Format("Ontology: Step 3/3: Getting number of cached items for category ({0}/{1})",
                catNo + 1, IndexServer.NUMBER_OF_CATEGORIES));
            for (int subcatNo = 0; subcatNo < Math.Min(IndexServer.NUMBER_OF_SUBCATEGORIES, rootXml.ChildNodes[catNo].ChildNodes.Count); subcatNo++)
            {
                AppendSearchResults(rootXml.ChildNodes[catNo].ChildNodes[subcatNo] as XmlElement, proxy, IndexServer.NUMBER_OF_LINKS);
            }
        }
        // Set timestamp for the new clusters.xml
        rootXml.SetAttribute("time", "" + DateTime.Now.ToFileTime());
        // Save new xml
        xmlDoc.Save(xmlFileName);
    }
    proxy.Logger.Info("Ontology: Finished successfully.");
}
/// <summary>
/// Replaces the children of a subcategory element with fresh Lucene search results.
/// The subcategory's feature attribute is used as the query string.
/// </summary>
/// <param name="subCategoryElement">The subcategory element to populate.</param>
/// <param name="proxy">The proxy.</param>
/// <param name="numberOfResults">The maximum number of results to add.</param>
private static void AppendSearchResults(XmlElement subCategoryElement, RCLocalProxy proxy, int numberOfResults)
{
    // Query Lucene with the subcategory's feature string.
    string query = subCategoryElement.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR);
    SearchResults results = proxy.IndexWrapper.Query(query, proxy.CachePath, 0, numberOfResults, true, -1);

    // Drop any stale children before attaching the fresh results.
    subCategoryElement.RemoveAllChilds();
    LocalInternalRequestHandler.AppendSearchResultsXMLElements(results, subCategoryElement.OwnerDocument, subCategoryElement);
}
/// <summary>
/// Creates the clusters.
///
/// Runs the full six-step pipeline: gather text files, build the doc file,
/// convert doc to matrix (Doc2Mat), run Cluto clustering, write clustersBT.xml,
/// and finally write clusters.xml. Each step is timed and logged; any failure
/// logs a warning/error and aborts the remaining steps.
/// </summary>
/// <param name="k">The number of clusters to create.</param>
/// <param name="catNFeatures">The maximum number of features for a category.</param>
/// <param name="subcatNFeatures">The maximum number of features for a subcategory.</param>
/// <param name="hierarchical">If the clusters should be organized hierarchical.</param>
/// <param name="maxCategories">The maximum number of categories.</param>
/// <param name="clustersPath">The path to the clusters folder.</param>
/// <param name="proxy">The proxy.</param>
public static void CreateClusters(int k, int catNFeatures, int subcatNFeatures, bool hierarchical,
    int maxCategories, string clustersPath, RCLocalProxy proxy)
{
    proxy.Logger.Info("Clustering: Creating clusters. This may take around an hour!");
    // Measure what part takes what time
    Stopwatch stopwatch = new Stopwatch();
    // Filenames
    string docFileName = clustersPath + DOC_FILE_NAME;
    string matFileName = clustersPath + MAT_FILE_NAME;
    string clustersFileName = clustersPath + CLUSTERS_FILE_NAME;
    string xmlBTFileName = clustersPath + CLUSTERS_BT_XML_FILE_NAME;
    string xmlFileName = clustersPath + IndexServer.CLUSTERS_XML_FILE_NAME;

    // get files
    proxy.Logger.Debug("Clustering (1/6): Getting all text files.");
    stopwatch.Start();
    List<string> textFiles = proxy.ProxyCacheManager.TextFiles();
    stopwatch.Stop();
    // FIX: log prefix was misspelled "Custering" here and in steps 2, 3, 4 and 6,
    // which broke grep-ability against all other "Clustering:" messages.
    proxy.Logger.Debug("Clustering (1/6): Getting all text files took: " + stopwatch.Elapsed.TotalSeconds + "s");

    // Abort if we're having less than 2 text files
    if (textFiles.Count < 2)
    {
        proxy.Logger.Debug("Clustering: Less than 2 text files, aborting.");
        return;
    }
    // List number of text files
    proxy.Logger.Debug(String.Format("Clustering (1/6): Using {0} text files.", textFiles.Count));

    List<string> titles;
    // files2doc
    proxy.Logger.Debug("Clustering (2/6): Creating docfile.");
    stopwatch.Restart();
    try
    {
        titles = Cluster.CreateDocFile(textFiles, docFileName);
    }
    catch (IOException e)
    {
        proxy.Logger.Warn("Clustering: DocFile creation failed.", e);
        return;
    }
    stopwatch.Stop();
    proxy.Logger.Debug("Clustering (2/6): Creating docfile took: " + stopwatch.Elapsed.TotalSeconds + "s");

    // doc2mat
    proxy.Logger.Debug("Clustering (3/6): Doc2Mat.");
    stopwatch.Restart();
    try
    {
        Doc2Mat.DoDoc2Mat(docFileName, matFileName);
    }
    catch (Exception e)
    {
        proxy.Logger.Warn("Clustering: Doc2Mat failed.", e);
        return;
    }
    stopwatch.Stop();
    proxy.Logger.Debug("Clustering (3/6): Doc2Mat took: " + stopwatch.Elapsed.TotalSeconds + "s");

    // ClutoClusters
    proxy.Logger.Debug("Clustering (4/6): Cluto-Clustering.");
    string treeFileName = null;
    HashSet<string>[] features;
    stopwatch.Restart();
    try
    {
        if (hierarchical)
        {
            // Hierarchical clustering additionally produces a tree file.
            treeFileName = clustersPath + TREE_FILE_NAME;
            features = Cluster.CreateClusters(matFileName, clustersFileName, k, true, treeFileName, catNFeatures, subcatNFeatures);
        }
        else
        {
            features = Cluster.CreateClusters(matFileName, clustersFileName, k, false, "", catNFeatures, subcatNFeatures);
        }
    }
    catch (Exception e)
    {
        proxy.Logger.Warn("Clustering: Cluto failed.", e);
        return;
    }
    stopwatch.Stop();
    proxy.Logger.Debug("Clustering (4/6): Cluto-Clustering took: " + stopwatch.Elapsed.TotalSeconds + "s");

    // Create binary tree XML file
    proxy.Logger.Debug("Clustering (5/6): Creating clustersBT.xml.");
    stopwatch.Restart();
    try
    {
        Cluster.CreateClusterBTXMLFile(textFiles, features, clustersFileName, (hierarchical ? treeFileName : ""),
            xmlBTFileName, k, proxy.CachePath.Length, titles);
    }
    catch (Exception e)
    {
        proxy.Logger.Warn("Clustering: Creating XML failed.", e);
        return;
    }
    stopwatch.Stop();
    proxy.Logger.Debug("Clustering (5/6): Creating clustersBT.xml took: " + stopwatch.Elapsed.TotalSeconds + "s");

    // Create XML file
    proxy.Logger.Debug("Clustering (6/6): Creating clusters.xml.");
    stopwatch.Restart();
    try
    {
        Cluster.CreateClusterXMLFile(xmlFileName, xmlBTFileName, maxCategories);
    }
    catch (Exception e)
    {
        proxy.Logger.Error("Clustering: Creating clusters.xml failed.", e);
        return;
    }
    stopwatch.Stop();
    proxy.Logger.Debug("Clustering (6/6): Creating clusters.xml took: " + stopwatch.Elapsed.TotalSeconds + "s");
    proxy.Logger.Info("Clustering: Finished successfully.");
}
/// <summary>
/// Determines the weight for a (sub)category and stores it as an attribute on
/// the element. The weight is the number of index hits for the element's
/// feature string, capped at <paramref name="limit"/>.
/// </summary>
/// <param name="element">The XML element</param>
/// <param name="proxy">The proxy.</param>
/// <param name="limit">The upper limit for number of search results, which is used as weight.</param>
private static void DetermineWeight(XmlElement element, RCLocalProxy proxy, int limit)
{
    string features = element.GetAttribute(IndexServer.INDEX_FEATURES_XML_ATTR);

    // Weight = capped number of index hits for the feature string.
    int weight = proxy.IndexWrapper.NumberOfResults(features, limit);
    element.SetAttribute(IndexServer.INDEX_WEIGHT_XML_ATTR, weight.ToString());
}
/// <summary>
/// Computes the 3rd level in the hierarchy for a given category and subcategory.
/// Builds a fresh index document containing the category, the subcategory and
/// up to NUMBER_OF_LINKS items; if the subcategory has no stored items, a live
/// Lucene search supplies them instead.
/// </summary>
/// <param name="clusterXMLFile">The path to clusters.xml</param>
/// <param name="categoryId">The category id.</param>
/// <param name="subCategoryId">The subcategory id.</param>
/// <param name="proxy">Proxy access to conduct a Lucene search.</param>
/// <returns>The index.xml string.</returns>
/// <exception cref="ArgumentException">If the category or subcategory id is unknown.</exception>
public static string Level3Index(string clusterXMLFile, string categoryId, string subCategoryId, RCLocalProxy proxy)
{
    XmlDocument clustersDoc = GetClustersXMLDocument(clusterXMLFile);

    // Fresh result document with a level-3 root element.
    XmlDocument indexDoc = new XmlDocument();
    indexDoc.AppendChild(indexDoc.CreateXmlDeclaration("1.0", "UTF-8", String.Empty));
    XmlElement root = indexDoc.CreateElement(INDEX_CATEGORIES_XML_NAME);
    indexDoc.AppendChild(root);
    root.SetAttribute(INDEX_LEVEL_XML_ATTR, 3.ToString());

    // Locate the requested category and subcategory in clusters.xml.
    XmlElement catSource = FindCategory(clustersDoc.DocumentElement, categoryId);
    if (catSource == null)
    {
        throw new ArgumentException("Could not find category with that id.");
    }
    XmlElement subcatSource = FindCategory(catSource, subCategoryId);
    if (subcatSource == null)
    {
        throw new ArgumentException("Could not find subcategory with that id.");
    }

    // Shallow-import both into the result document.
    XmlNode catCopy = root.AppendChild(indexDoc.ImportNode(catSource, false));
    XmlNode subcatCopy = catCopy.AppendChild(indexDoc.ImportNode(subcatSource, false));

    if (subcatSource.ChildNodes.Count == 0)
    {
        // No stored items: fall back to a live Lucene search.
        SearchResults results = proxy.IndexWrapper.Query(subcatSource.GetAttribute(INDEX_FEATURES_XML_ATTR),
            proxy.CachePath, 0, NUMBER_OF_LINKS, true, -1);
        LocalInternalRequestHandler.AppendSearchResultsXMLElements(results, indexDoc, subcatCopy as XmlElement);
    }
    else
    {
        // Deep-import the stored items, capped at NUMBER_OF_LINKS (0 = unlimited).
        int available = subcatSource.ChildNodes.Count;
        int cap = NUMBER_OF_LINKS == 0 ? available : Math.Min(NUMBER_OF_LINKS, available);
        for (int itemNo = 0; itemNo < cap; itemNo++)
        {
            subcatCopy.AppendChild(indexDoc.ImportNode(subcatSource.ChildNodes[itemNo], true));
        }
    }
    return indexDoc.InnerXml;
}
/// <summary>
/// Removes all dead links from the index.
///
/// Scans every live document in the Lucene index; entries whose file is no
/// longer known to the cache manager are either deleted from the index (file
/// gone) or re-registered in the cache database (file exists but DB entry is
/// missing — a "cache failure"). Commits every 100 deletions and once at the end.
///
/// FIX: the reader and writer were previously leaked if any exception escaped
/// the scan loop; they are now closed in a finally block. reader.MaxDoc() is
/// also hoisted out of the loop — the reader is opened read-only, so it cannot
/// change during the scan.
/// </summary>
/// <param name="proxy">The proxy, to log and to gain access to the cache manager.</param>
public void RemoveAllDeadLinks(RCLocalProxy proxy)
{
    proxy.Logger.Info(String.Format("The index contains {0} documents.", IndexItemCount()));
    proxy.Logger.Info("Deleting all dead links from index...");
    Lucene.Net.Store.FSDirectory directory = Lucene.Net.Store.FSDirectory.Open(new System.IO.DirectoryInfo(_indexPath));
    IndexReader reader = IndexReader.Open(directory, true);
    IndexWriter writer = new IndexWriter(_indexPath, _analyzer, false);
    int deleted = 0;
    int cacheFailures = 0;
    try
    {
        int maxDoc = reader.MaxDoc();
        for (int i = 0; i < maxDoc; i++)
        {
            if (reader.IsDeleted(i))
            {
                continue;
            }
            proxy.Logger.Debug(i + " files scanned out of: " + maxDoc);
            Document doc = reader.Document(i);
            string uri = doc.Get("uri");
            string relFileName = CacheManager.GetRelativeCacheFileName(uri, "GET");
            string absFileName = proxy.ProxyCacheManager.CachePath + relFileName;
            if (!proxy.ProxyCacheManager.IsCached(relFileName))
            {
                if (File.Exists(absFileName))
                {
                    // File is on disk but missing from the cache DB: repair the
                    // DB entry instead of deleting the index document.
                    cacheFailures++;
                    proxy.Logger.Warn(String.Format(
                        "Cache Failure {0}. Not cached but file exists: {1}", cacheFailures, relFileName));
                    NameValueCollection headers = new NameValueCollection()
                    {
                        // We need to include content-type, as we always want that header!
                        { "Content-Type", "text/html"}
                    };
                    // The index entry already exists, so we don't want to create a new one here.
                    GlobalCacheItemToAdd newItem = new GlobalCacheItemToAdd(relFileName, headers, 200, false);
                    // Add file to the database
                    proxy.ProxyCacheManager.AddCacheItemsForExistingFiles(new HashSet<GlobalCacheItemToAdd>() { newItem });
                    continue;
                }
                // File is gone entirely: drop the index document.
                deleted++;
                proxy.Logger.Info(String.Format("This is doc number {0} we delete. Deleting {1} from the lucene index.",
                    deleted, relFileName));
                writer.DeleteDocuments(new Term("uri", uri));
                if (deleted % 100 == 0)
                {
                    // Periodic commit so progress survives a crash mid-scan.
                    writer.Commit();
                }
            }
        }
        writer.Commit();
    }
    finally
    {
        reader.Close();
        writer.Close();
    }
    proxy.Logger.Info(String.Format(
        "Deleted all dead links from index. Deleted {0} index items and added {1} DB items.", deleted, cacheFailures));
}
/// <summary>
/// Instantiate a new CrawlerWrapper bound to the given local proxy.
/// </summary>
/// <param name="proxy">The local proxy.</param>
public CrawlerWrapper(RCLocalProxy proxy)
{
    _proxy = proxy;
}
/// <summary>
/// Tests ALL link suggestions in LS_DEBUG mode. Should only be used for small caches.
///
/// Walks every cached text file, extracts all text-only anchors, resolves them
/// against the page URL, and triggers GetLinkSuggestions for every target that
/// is itself cached — so the debug logging there can report suggestion quality.
///
/// FIX: the self-link check previously used url.ToLower().Equals(target.ToLower()),
/// a culture-sensitive comparison that misbehaves e.g. under the Turkish locale;
/// it now uses an ordinal case-insensitive comparison.
/// </summary>
/// <param name="proxy">The local proxy.</param>
public static void DebugLinkSuggestion(RCLocalProxy proxy)
{
    CacheManager cm = proxy.ProxyCacheManager;
    // NOTE(review): resolved relative to the process working directory — confirm
    // stopwords.txt is deployed next to the executable.
    string[] stopwords = File.ReadAllLines("stopwords.txt");
    List<string> sites = cm.TextFiles();
    int i = 1;
    foreach (string site in sites)
    {
        proxy.Logger.Debug(String.Format("Working site {0} of {1}", i++, sites.Count));
        string url = cm.AbsoluteFilePathToUri(site);
        string content = Utils.ReadFileAsString(site);
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(content);
        HtmlNode head = doc.DocumentNode.SelectSingleNode("/html/head");
        HtmlNode body = doc.DocumentNode.SelectSingleNode("/html/body");
        if (head == null || body == null)
        {
            // We haven't sane HTML, just continue
            continue;
        }
        // Gets all links containing text.
        HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//a[not(node()[2])][text()]/@href");
        if (links != null)
        {
            int ign;
            foreach (HtmlNode link in links)
            {
                string relTarget = link.GetAttributeValue("href", "");
                string target;
                try
                {
                    // Resolve the (possibly relative) href against the page URL.
                    target = new Uri(new Uri(url), relTarget).ToString();
                }
                catch (UriFormatException)
                {
                    continue;
                }
                if (LINK_ANCHOR_BLACKLIST.Contains(link.InnerText.ToLower())
                    || Int32.TryParse(link.InnerText, out ign)
                    || url.Equals(target, StringComparison.OrdinalIgnoreCase))
                {
                    // No "Here", ... links or number links
                    // No links to the same page! (LS_DEBUG)
                    continue;
                }
                string relFileName = CacheManager.GetRelativeCacheFileName(target, "GET");
                if (!target.Equals("") && cm.IsCached(relFileName))
                {
                    // Get anchor and surrounding text
                    string anchorText = link.InnerText;
                    string surroundingText = HtmlUtils.GetSurroundingText(link, stopwords, false);
                    // Trigger LS
                    GetLinkSuggestions(target, url, anchorText, surroundingText, 3, proxy);
                }
            }
        }
    }
}
/// <summary>
/// Gets the link suggestions for an uncached link.
///
/// Queries the index for amount + 1 results (LS_DEBUG_NUM_RESULTS in debug mode),
/// since the referrer page itself is very likely among the hits; the referrer is
/// then removed so the caller receives at most <paramref name="amount"/> suggestions.
///
/// FIX: the trim for the case where the referrer was NOT among the results compared
/// against amount0 (= amount + 1) — but the query is capped at amount0 results, so
/// the condition could never be true and callers received amount + 1 suggestions.
/// The trim now compares against amount (non-debug mode only, so the debug position
/// scan below still sees the full result list).
/// </summary>
/// <param name="url">The absolute URL.</param>
/// <param name="refUrl">The referer URL.</param>
/// <param name="anchorText">The anchor text.</param>
/// <param name="surroundingText">The surrounding text.</param>
/// <param name="amount">The amount of suggestions to get.</param>
/// <param name="proxy">The local proxy.</param>
/// <returns>The suggestions.</returns>
public static IEnumerable<SearchResult> GetLinkSuggestions(string url, string refUrl, string anchorText,
    string surroundingText, int amount, RCLocalProxy proxy)
{
    // Remove all http:// or https:// from the query
    string url0 = url.Replace("http://", "").Replace("https://", "");
    string refUrl0 = refUrl.Replace("http://", "").Replace("https://", "");
    string anchorText0 = anchorText.Replace("http://", "").Replace("https://", "");
    string surroundingText0 = surroundingText.Replace("http://", "").Replace("https://", "");
    // If we're debugging, we want 51 results, otherwise
    // we want one result more, as we're very probably going to find the referrer page
    int amount0 = Properties.Network.Default.LS_DEBUG ? LS_DEBUG_NUM_RESULTS : amount + 1;
    SearchResults luceneResults = proxy.IndexWrapper.Query(new string[] { url0, refUrl0, anchorText0, surroundingText0 },
        LINK_SUGGESTION_BOOSTS, proxy.CachePath, 0, amount0, true, -1);
    // remove the referrer page from the results
    for (int i = 0; i < luceneResults.Results.Count; i++)
    {
        // Ordinal comparison: URL equality is not a linguistic operation.
        if (luceneResults.Results[i].URI.Equals(refUrl, StringComparison.OrdinalIgnoreCase))
        {
            luceneResults.RemoveDocument(i);
            break;
        }
    }
    // In the rare case that the referrer page was not among the results, we have
    // to remove the last result so that at most `amount` suggestions are returned.
    if (!Properties.Network.Default.LS_DEBUG && luceneResults.Results.Count > amount)
    {
        luceneResults.RemoveDocument(luceneResults.Results.Count - 1);
    }
    if (Properties.Network.Default.LS_DEBUG)
    {
        bool sameDomain = new Uri(refUrl).Host.Equals(new Uri(url).Host);
        bool found = false;
        // If we're debugging we want to find out the position of the original url.
        for (int i = 0; i < luceneResults.Results.Count; i++)
        {
            if (luceneResults.Results[i].URI.Equals(url, StringComparison.OrdinalIgnoreCase))
            {
                found = true;
                // Warn level makes it easier to find.
                proxy.Logger.Warn(String.Format("LS_DEBUG: {0}|{1}|{2}|{3}", i, sameDomain, refUrl0, url0));
                break;
            }
        }
        if (!found)
        {
            proxy.Logger.Warn(String.Format("LS_DEBUG: -1|{0}|{1}|{2}", sameDomain, refUrl0, url0));
        }
        return luceneResults.Take(amount);
    }
    return luceneResults;
}