/// <summary>
/// Background-worker entry point: downloads the site list for each cluster,
/// builds a word-frequency table (HTable) for every site, persists each table
/// to disk, and inserts it into the B-tree index in Session.Instance.Tree.
/// </summary>
/// <param name="sender">The BackgroundWorker raising the event (unused).</param>
/// <param name="e">Work event arguments (unused).</param>
private void loadWorker_DoWork(object sender, DoWorkEventArgs e)
{
    // Number of site clusters; must match WebCompareModel.Websites / SitesEnum.
    const int ClusterCount = 5;
    // Total number of site tables the loader aims to end up with.
    const int TargetSiteCount = 1531;
    // How often (in sites) to emit a progress message.
    const int StatusInterval = 100;

    // Create new tree
    string[] parsedData = null;
    int TableNumber = 0;
    try
    {
        // Get list of websites, one array per cluster.
        string[][] AllSites = new string[ClusterCount][];
        // Resume table numbering after whatever is already stored.
        TableNumber = GetNumberOfKeys() + 1;
        int sitesRemaining = TargetSiteCount - TableNumber;
        for (int i = 0; i < AllSites.Length; ++i)
        {
            AddMessage($"Getting site list for '{Enum.GetName(typeof(WebCompareModel.SitesEnum), i)}' cluster");
            // Split the remaining quota evenly across the clusters.
            AllSites[i] = WebCompareModel.GetSiteList(WebCompareModel.Websites[i], (sitesRemaining / ClusterCount));
        }

        // Build frequency tables from 1000+ sites
        int sitecateg = 1, statusCount = 1, scx = 1;
        foreach (string[] sites in AllSites)
        {
            AddMessage("Building frequency tables for cluster.. " + sitecateg);
            foreach (string site in sites)
            {
                // Periodic status display.
                if (statusCount == StatusInterval)
                {
                    // FIX: use Length instead of LINQ Count() on an array (CA1829).
                    // NOTE(review): the "*5" and "+70" fudge factors reproduce the
                    // original progress estimate — confirm against real site counts.
                    AddMessage($"Sites to go..{sites.Length * 5 - scx + 70}");
                    statusCount = 0;
                }
                ++statusCount;
                ++scx;

                // Get data from website and parse
                parsedData = WebCompareModel.GetWebDataAgility(site);

                // Fill a new HTable (frequency table)
                HTable table = new HTable();
                table.URL = site;
                // assumes the URL prefix is exactly 30 chars — TODO confirm
                table.Name = site.Substring(30);
                if (parsedData != null)
                {
                    for (int w = 0; w < parsedData.Length; ++w)
                    {
                        table.Put(parsedData[w], 1);
                    }
                }

                // Write HTable to file
                table.SaveTable(TableNumber);
                // Add HTable to BTree, including write to file
                Session.Instance.Tree.Insert(TableNumber, table.Name);
                ++TableNumber;
            }
            AddMessage("Completed building frequency table " + sitecateg);
            ++sitecateg;
        } // End AllSites foreach
    }
    catch (Exception err)
    {
        MessageBox.Show("Exception caught: " + err,
            "Exception:Loader:loaderWorker_DoWork()",
            MessageBoxButton.OK, MessageBoxImage.Warning);
    }
}
/// <summary>
/// Recursively builds the site graph: fetches <paramref name="site"/>, saves its
/// frequency table, creates a vertex with the same ID, links it to
/// <paramref name="parent"/> with similarity-weighted edges in both directions,
/// then recurses into the sites it links to. Recursion is bounded by
/// levelCount (depth) and MAXSITES (total vertex budget).
/// </summary>
/// <param name="site">URL to fetch and add to the graph.</param>
/// <param name="parent">Vertex to connect the new vertex to; null for the root.</param>
/// <param name="sim">Out: 1 - cosine similarity (lower = more similar), used as an
/// edge weight for shortest-path search. NOTE(review): this out parameter is also
/// reused as scratch in the neighbor loop below, so the value the caller receives
/// may reflect the LAST already-known neighbor rather than the parent link —
/// confirm this is intended.</param>
/// <returns>The vertex created for <paramref name="site"/>, or null on failure or
/// once the MAXSITES cap is reached.</returns>
public Vertex BuildGraph(string site, Vertex parent, out float sim) {
    // Decrement count because we just went down a level
    --levelCount;
    sim = 0;
    // Stop expanding once the global vertex budget is spent.
    if (siteCount >= MAXSITES) { return(null); }
    try {
        // Take one site and a parent vertex
        // Get data and list of sites
        List<string> parsedData;
        List<string> sites;
        GetWebDataAgility(site, out parsedData, out sites);
        // Fill a new HTable (frequency table)
        HTable vTable = new HTable();
        vTable.ID = ++TableNumber;
        vTable.URL = site;
        // assumes the URL prefix is exactly 30 chars — TODO confirm
        vTable.Name = site.Substring(30);
        if (parsedData != null) {
            for (int w = 0; w < parsedData.Count; ++w) { vTable.Put(parsedData[w], 1); }
        }
        // Write HTable to file
        vTable.SaveTable(vTable.ID);
        // Create a vertex for this site with the same ID as HTable
        Vertex v = new Vertex(vTable.ID, vTable.URL);
        // Create an edge connecting this vertex and parent vertex
        if (parent != null) {
            // Calc similarity to parent
            HTable parentTable = LoadTable(parent.ID);
            List<object>[] vector = WebCompareModel.BuildVector(vTable, parentTable);
            //// Calculate similarity, 1 minus for shortest path calculation, lower numbers are more similar
            sim = 1 - (float)WebCompareModel.CosineSimilarity(vector);
            // Create edge to parent (undirected link stored as two directed edges)
            Edge e1 = new Edge(parent.ID, v.ID, sim, ++EdgeNumber);
            Edge e2 = new Edge(v.ID, parent.ID, sim, ++EdgeNumber);
            //mainGraph.AddEdge(e1);      // Add edge to Graph list
            //mainGraph.SaveEdge(e1);     // Write edge to disk
            //mainGraph.AddEdge(e2);      // Add edge to Graph list
            //mainGraph.SaveEdge(e2);     // Write edge to disk
            // Add eachother as neighbors
            parent.AddNeighbor(e1);
            v.AddNeighbor(e2);
            // Update parent vertex
            mainGraph.AddVertex(parent);
            mainGraph.SaveVertex(parent);
        }
        // Update/Add to graph
        mainGraph.AddVertex(v);
        mainGraph.SaveVertex(v);
        ++siteCount;
        // Add list of sites to this vertex
        //// Foreach- add, recursively call this method
        foreach (var s in sites) {
            // Check for nulls, Don't compare to itself
            if (s == null || s == site) { continue; }
            // Don't get more sites if site tree has been built already
            Vertex v2 = mainGraph.HasVertex(s);
            if (v2 != null) {
                LoadStatus += ".";
                // Calc similarity to the already-known vertex
                HTable v2Table = LoadTable(v2.ID);
                sim = 0; // clear — NOTE(review): clobbers the out value computed for the parent link above
                if (v2Table != null) {
                    List<object>[] vector = WebCompareModel.BuildVector(vTable, v2Table);
                    //// Calculate similarity, 1 minus for shortest path calculation, lower numbers are more similar
                    sim = 1 - (float)WebCompareModel.CosineSimilarity(vector);
                    // Create edge to the existing vertex (both directions)
                    Edge e = new Edge(v.ID, v2.ID, (float)sim, ++EdgeNumber);
                    //mainGraph.AddEdge(e);   // Add edge to Graph list
                    //mainGraph.SaveEdge(e);  // Write edge to disk
                    // Add eachother as neighbors
                    v.AddNeighbor(e);
                    v2.AddNeighbor(new Edge(v2.ID, v.ID, (float)sim, ++EdgeNumber));
                    // Update/Add to graph
                    mainGraph.AddVertex(v);
                    mainGraph.AddVertex(v2);
                }
                mainGraph.SaveVertex(v2);
            } else {
                float simout = 0;
                Vertex neighb = null;
                // Don't build another graph off this node if the level count hits 0
                if (levelCount > 0) {
                    neighb = BuildGraph(s, v, out simout);
                    // Increment levelcount because we just came back up a level
                    ++levelCount;
                } else {
                    // NOTE(review): returning here abandons the remaining siblings in
                    // this foreach — confirm a `break` was not intended instead.
                    return(v);
                }
                if (neighb != null) {
                    Edge newEdge = new Edge(v.ID, neighb.ID, simout, EdgeNumber++);
                    v.AddNeighbor(newEdge);
                    //mainGraph.SaveEdge(newEdge);
                }
            }
        }
        // Update Vertex to graph and persist
        if (v != null) {
            mainGraph.AddVertex(v);
            mainGraph.SaveVertex(v);
        }
        return(v);
    } catch (Exception exc) {
        Console.WriteLine("Error building graph: " + exc);
    }
    // Failure path: similarity is meaningless; report no vertex.
    sim = 0;
    return(null);
}
/// <summary>
/// Background-worker entry point: builds a frequency table for the user-entered
/// URL, computes cosine similarity against every stored site table, keeps the
/// closest matches, refreshes tables older than 30 days, and reports the ten
/// most similar sites into the view model.
/// </summary>
/// <param name="sender">The BackgroundWorker raising the event (unused).</param>
/// <param name="e">Work event arguments (unused).</param>
private void worker_DoWork(object sender, DoWorkEventArgs e)
{
    // How often (in sites) to emit a progress message.
    const int StatusInterval = 50;
    // How many results to display at the end.
    const int TopDisplayCount = 10;

    AddMessage("");
    wcViewModel.Results = "";
    wcViewModel.Results = "Top 10 most similar websites: ";

    // Build frequency table for user entered URL
    AddMessage("Building Entered URL frequency table..");
    // Get data from websit and parse
    string[] parsedData = WebCompareModel.GetWebDataAgility(wcViewModel.UserURL);

    // Fill HTable
    HTable compareTable = new HTable();
    compareTable.URL = wcViewModel.UserURL;
    // assumes the URL prefix is exactly 30 chars — TODO confirm
    compareTable.Name = wcViewModel.UserURL.Substring(30);
    // FIX: GetWebDataAgility can return null (the loader and graph builder both
    // guard for it); previously a null result threw NullReferenceException here.
    if (parsedData != null)
    {
        for (int w = 0; w < parsedData.Length; ++w)
        {
            compareTable.Put(parsedData[w], 1);
        }
    }

    // Key/similarity pairs for the closest sites seen so far
    List<KeyValuePair<long, double>> topSites = new List<KeyValuePair<long, double>>();

    // Foreach sites,
    AddMessage("\nCalculating Similarities for 1000+ webistes..");
    SiteTotal = LoaderViewModel.GetNumberOfKeys();
    int statusCount = 0;
    AddMessage($"\nSites to go..{SiteTotal}");
    // NOTE(review): `i < SiteTotal` never visits table number SiteTotal itself —
    // confirm whether tables are numbered 1..SiteTotal inclusive.
    for (int i = 1; i < SiteTotal; ++i)
    {
        // Status display
        ++statusCount;
        if (statusCount == StatusInterval)
        {
            AddMessage($"\nSites to go..{SiteTotal - i + 70}");
            statusCount = 0;
        }

        //// Build Vector
        HTable tempTable = LoadTable(i);
        if (tempTable != null)
        {
            List<object>[] vector = WebCompareModel.BuildVector(tempTable, compareTable);
            //// Calculate similarity
            tempTable.Similarity = WebCompareModel.CosineSimilarity(vector);
            //// Update table
            tempTable.SaveTable(i);
            //// Maintain list of top sites (IDs)
            topSites = AddTopSite(topSites, new KeyValuePair<long, double>(i, tempTable.Similarity));
            //// Refresh the stored table if it is more than 30 days old
            DateTime compDate = DateTime.Now.Subtract(new TimeSpan(30, 0, 0, 0));
            if (tempTable.LastUpdated < compDate)
            {
                //// Update the HTable
                UpdateTable(ref tempTable);
            }
        }
    }

    // For top 10 websites
    // FIX: bound by topSites.Count so fewer than 10 stored sites no longer
    // throws ArgumentOutOfRangeException on the indexer.
    int displayCount = Math.Min(TopDisplayCount, topSites.Count);
    for (int i = 0; i < displayCount; ++i)
    {
        //// Get Name of site using Key
        //// Display the site
        string siteName = Tree.SearchTree(topSites[i].Key, 1);
        wcViewModel.Results += "\n" + siteName;
    }

    // Calculate and Display most similar
    GetResult(topSites);
}