/// <summary>
/// Keeps the <paramref name="k"/> features whose single-feature XPath query has the
/// lowest entropy with respect to <c>DomPool.TargetNodes</c>.
/// </summary>
/// <param name="featureSet">Candidate features.</param>
/// <param name="k">Number of features to keep.</param>
/// <returns>
/// <paramref name="featureSet"/> itself when it holds at most <c>k * 200</c> features,
/// otherwise a new set with the <paramref name="k"/> lowest-entropy features.
/// </returns>
public static HashSet<Feature> KeepTopK(HashSet<Feature> featureSet, int k)
{
    // This guard effectively disables the filtering for small sets: it costs in
    // performance much more than it saves.
    if (featureSet.Count() <= k * 200)
    {
        return featureSet;
    }

    List<KeyValuePair<Feature, double>> scored = new List<KeyValuePair<Feature, double>>();
    foreach (Feature candidate in featureSet)
    {
        HashSet<Feature> singleton = new HashSet<Feature>(new Feature[] { candidate });
        HashSet<HtmlNode> matched = DomPool.RunXpathQuery(XpathTools.FeatureSetToXpath(singleton));
        HashSet<HtmlNode> matchedTargets = new HashSet<HtmlNode>(matched.Intersect(DomPool.TargetNodes));

        // NOTE(review): when the query matches nothing this ratio is 0/0 (NaN) — confirm
        // how Statistics.CalculateEntropy and the sort below are expected to treat it.
        double targetRatio = (double)matchedTargets.Count() / matched.Count();
        double entropy = Statistics.CalculateEntropy(targetRatio, 1 - targetRatio);

        scored.Add(new KeyValuePair<Feature, double>(candidate, entropy));
    }

    // OrderBy is stable; reversing first puts later-enumerated features ahead of
    // earlier ones when entropies tie.
    scored.Reverse();

    IEnumerable<Feature> lowestEntropy = scored
        .OrderBy(pair => pair.Value)
        .Select(pair => pair.Key)
        .Take(k);

    return new HashSet<Feature>(lowestEntropy.ToList());
}
/// <summary>
/// Classifies every node in <c>testSeenAllNodes</c> with <c>classifierTree</c> and
/// collects the nodes whose predicted class label is "yes".
/// </summary>
/// <returns>The nodes the classifier labelled "yes".</returns>
public HashSet<HtmlNode> RunOnTestSeenSet()
{
    HashSet<HtmlNode> accepted = new HashSet<HtmlNode>();
    InitTestSeen();

    // Record, for each known node, which of the model's features select it.
    foreach (string feature in FeaturesUsed)
    {
        HashSet<HtmlNode> hits = DomPool.TESTSeenRunXpathQuery(useNormalPerformanceQUERY(feature));
        foreach (HtmlNode hit in hits)
        {
            if (testSeenAllNodes.Contains(hit))
            {
                testSeenNodeFeatures[hit].Add(feature);
            }
        }
    }

    FastVector attributes = GetDataSetAtts();
    Instances testSet = new Instances("TestSeenSet", attributes, 10);
    // The last attribute is the class attribute.
    testSet.setClassIndex(attributes.size() - 1);

    foreach (HtmlNode node in testSeenAllNodes)
    {
        Instance row = new SparseInstance(attributes.size());

        // Every attribute except the last is a 0/1 "feature matches this node" flag.
        int position = 0;
        while (position < attributes.size() - 1)
        {
            weka.core.Attribute featureAttribute = (weka.core.Attribute)attributes.elementAt(position);
            bool matches = testSeenNodeFeatures[node].Contains(featureAttribute.name());
            row.setValue(featureAttribute, matches ? 1 : 0);
            position++;
        }

        weka.core.Attribute classAttribute = (weka.core.Attribute)attributes.elementAt(attributes.size() - 1);
        row.setDataset(testSet);

        // classifyInstance returns the index of the predicted class value.
        double predictedIndex = classifierTree.classifyInstance(row);
        string predictedLabel = classAttribute.value((int)predictedIndex);
        if (predictedLabel.Equals("yes"))
        {
            accepted.Add(node);
        }

        testSet.add(row);
    }

    return accepted;
}
/// <summary>
/// Walks this decision node and its children and returns the subset of
/// <paramref name="nodes"/> the tree selects.
/// </summary>
/// <param name="nodes">Nodes reaching this decision node.</param>
/// <param name="prevFeatures">Features already applied on the path to this node.</param>
/// <param name="right">
/// When this node adds no new feature: <c>true</c> returns <paramref name="nodes"/>,
/// <c>false</c> returns an empty set.
/// </param>
/// <param name="threshold">Precision at or above which <paramref name="nodes"/> is returned unchanged.</param>
/// <returns>The selected nodes.</returns>
public HashSet<HtmlNode> selectTrue(HashSet<HtmlNode> nodes, HashSet<Feature> prevFeatures, Boolean right, double threshold = 1)
{
    // Precise enough already: accept everything that reached this node.
    if (this.precision >= threshold)
    {
        return nodes;
    }

    HashSet<Feature> unseenFeatures = new HashSet<Feature>(this.FeatureSet.Except(prevFeatures));
    if (unseenFeatures.Count() == 0)
    {
        return right ? nodes : new HashSet<HtmlNode>();
    }

    // Apply the first feature this node adds on top of the path so far.
    Feature splitFeature = unseenFeatures.First();
    string splitXpath = XpathTools.FeatureSetToXpath(new HashSet<Feature>() { splitFeature });
    HashSet<HtmlNode> matching = DomPool.RunXpathQuery(splitXpath);
    // Narrows the query result in place to the nodes under consideration.
    matching.IntersectWith(nodes);

    HashSet<HtmlNode> fromSelectedBranch = this.SetSelected.selectTrue(matching, this.FeatureSet, true, threshold);
    HashSet<HtmlNode> fromNotSelectedBranch = this.SetNotSelected.selectTrue(nodes, prevFeatures, false, threshold);

    return new HashSet<HtmlNode>(fromSelectedBranch.Union(fromNotSelectedBranch));
}
/// <summary>
/// Loads the training documents found at <paramref name="filesLocation"/>, grows a
/// decision tree over the DOM pool and returns the XPath produced by
/// <c>XpathTools.GenerateAForgivingXpath</c> for that tree.
/// </summary>
/// <param name="filesLocation">Location passed to <c>DomPool.LoadDocuments</c>.</param>
/// <returns>The generated XPath expression.</returns>
public static string LearnXpathFromTrainingFiles(string filesLocation)
{
    DomPool.LoadDocuments(filesLocation);
    DomPool.Initiate();
    DomPool.ExtractAllFeatures();

    // The root starts with every labelled node and no features chosen yet.
    DecisionNode root = new DecisionNode
    {
        InitialNodeSet = new HashSet<HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes)),
        SelectedNegative = new HashSet<HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision)),
        SelectedPositive = new HashSet<HtmlNode>(DomPool.TargetNodes),
        FeatureSet = new HashSet<Feature>()
    };
    root.CalculateEntropy();

    DecisionTreeLearning.RecursiveTreeImprovement(root);

    // Alternative (non-forgiving) form: "//*[" + XpathTools.DecisionTreeToXpath(root, new HashSet<Feature>()) + "]"
    string learnedXpath = XpathTools.GenerateAForgivingXpath(root);
    return learnedXpath;
}
/// <summary>
/// Benchmark driver: for every category folder and attribute folder under
/// <paramref name="filesLocation"/>, and for every training-set size, trains the
/// selected tool on each subset of documents, runs it on the test documents and
/// prints per-site and averaged precision / recall / F-score plus the accumulated
/// run time to the console.
/// </summary>
/// <param name="filesLocation">
/// Root folder containing one sub-folder per category, each containing one
/// sub-folder per attribute. Test documents are loaded from the same path with
/// <paramref name="filesLocation"/> replaced by "testset".
/// </param>
public static void RunTest(string filesLocation)
{
    string[] folders = Directory.GetDirectories(filesLocation);
    foreach (string fldr in folders)
    {
        Console.WriteLine("Running for category:" + fldr);
        string[] innerfolders = Directory.GetDirectories(fldr);
        foreach (string innerdir in innerfolders)
        {
            Console.Out.Flush();
            Console.WriteLine("Running for att:" + innerdir);
            DomPool.LoadTestDocuments(innerdir.Replace(filesLocation, "testset"));
            DomPool.LoadDocuments(innerdir);
            // i is the training-set size: from 1 document up to all-but-one.
            //for(int i= (DomPool.allDocsNames.Count() - 1); i <= (DomPool.allDocsNames.Count()-1)/*DomPool.allDocsNames.Count()*/; i++)
            for (int i = 1; i <= (DomPool.allDocsNames.Count() - 1); i++)
            {
                // Display labels, indexed by tool number.
                // NOTE(review): index 3 is labelled "svm" but the tool == 3 branch below runs the NB class — confirm the label.
                string[] tools = new string[] { "our", "our - not forgiving", "j48", "svm", "xpath-align", "svm" };
                // With toolStart = 5 and the loop bound of 6, only tool 5 (the SVM class) is executed.
                int toolStart = 5;
                // Non-forgiving XPaths learned by tool 0, keyed by the subset's string form, for reuse by tool 1.
                Dictionary <string, string> xpathNonForgiving = new Dictionary <string, string>();
                for (int tool = toolStart; tool < 6; tool++)
                {
                    Console.WriteLine("[-] running for training set size=" + i);
                    // presumably every index subset of size i out of allDocsNames — confirm against Subsets
                    IEnumerable <IEnumerable <int> > subsetsIndexes = Subsets(DomPool.allDocsNames.Count(), i);
                    //Reduce size ...for testing only
                    //subsetsIndexes = subsetsIndexes.Take(30);
                    double totalAccuracy = 0;
                    double totalRecall = 0;
                    // Accumulated milliseconds spent running (not learning) the models.
                    long totalTime = 0;
                    Console.WriteLine("[-] tool:" + tools[tool]);
                    Console.WriteLine("+ will run " + subsetsIndexes.Count() + " different iterations for the current set size");
                    // Iteration counter, used only for the progress dots.
                    int s = 0;
                    // Per-site running sums over all subsets, and the number of subsets in which the site was a test site.
                    Dictionary <String, double> SiteTotalRecall = new Dictionary <string, double>();
                    Dictionary <String, double> SiteTotalPrecision = new Dictionary <string, double>();
                    Dictionary <String, double> SiteTotalTests = new Dictionary <string, double>();
                    foreach (string site in DomPool.allDocsNames)
                    {
                        SiteTotalPrecision[site] = 0;
                        SiteTotalRecall[site] = 0;
                        SiteTotalTests[site] = 0;
                    }
                    foreach (IEnumerable <int> currSubsetIndexes in subsetsIndexes)
                    {
                        List <int> listRep = new List <int>(currSubsetIndexes);
                        // String key for this subset, e.g. ",0,2".
                        string stringRep = listRep.Aggregate("", (b, x) => b + "," + x);
                        s++;
                        if (s % 10 == 0)
                        {
                            //Console.Write("(" + s + "/" + subsetsIndexes.Count() + ") ");
                            Console.Write(".");
                        }
                        // Re-initialise the pool with the current training subset and extract its features.
                        //if (tool == toolStart)
                        //{
                        HashSet <String> currSubset = GetSubSet(DomPool.allDocsNames, currSubsetIndexes);
                        DomPool.Initiate(currSubset);
                        DomPool.ExtractAllFeatures();
                        //}
                        var runres = new HashSet <HtmlNode>();
                        //our method
                        if (tool < 2)
                        {
                            string xpath = "";
                            if (tool == 0)
                            {
                                DecisionNode dn = new DecisionNode();
                                dn.InitialNodeSet = new HashSet <HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
                                dn.SelectedNegative = new HashSet <HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
                                dn.SelectedPositive = new HashSet <HtmlNode>(DomPool.TargetNodes);
                                dn.FeatureSet = new HashSet <Feature>();
                                dn.CalculateEntropy();
                                DecisionTreeLearning.RecursiveTreeImprovement(dn);
                                xpath = XpathTools.GenerateAForgivingXpath(dn);
                                // Also remember the non-forgiving XPath, wrapped as "//*[...]" (or bare "//*" when the condition is empty).
                                xpathNonForgiving[stringRep] = XpathTools.DecisionTreeToXpath(dn, new HashSet <Feature>(), 1);
                                xpathNonForgiving[stringRep] = "//*" + (xpathNonForgiving[stringRep].Equals("") ? "" : ("[" + xpathNonForgiving[stringRep] + "]"));
                            }
                            if (tool == 1)
                            {
                                // Relies on tool 0 having run for the same subset in this training-set size.
                                xpath = xpathNonForgiving[stringRep];
                            }
                            Console.WriteLine("Query:" + xpath);
                            var watch = Stopwatch.StartNew();
                            runres = DomPool.TESTRunXpathQuery(xpath);
                            watch.Stop();
                            var elapsedMs = watch.ElapsedMilliseconds;
                            totalTime = totalTime + elapsedMs;
                        }
                        else
                        {
                            if (tool == 2)
                            {
                                ModelLearner model = new ModelLearner();
                                model.LearnModel();
                                var watch = Stopwatch.StartNew();
                                runres = model.RunOnTestSet();
                                watch.Stop();
                                var elapsedMs = watch.ElapsedMilliseconds;
                                totalTime = totalTime + elapsedMs;
                            }
                            else
                            {
                                if (tool == 3)
                                {
                                    NB model = new NB();
                                    model.LearnModel();
                                    var watch = Stopwatch.StartNew();
                                    runres = model.RunOnTestSet();
                                    watch.Stop();
                                    var elapsedMs = watch.ElapsedMilliseconds;
                                    totalTime = totalTime + elapsedMs;
                                }
                                else
                                {
                                    if (tool == 4)
                                    {
                                        XpathAlignment model = new XpathAlignment();
                                        model.LearnModel();
                                        var watch = Stopwatch.StartNew();
                                        runres = model.RunOnTestSet();
                                        watch.Stop();
                                        var elapsedMs = watch.ElapsedMilliseconds;
                                        totalTime = totalTime + elapsedMs;
                                    }
                                    else
                                    {
                                        SVM model = new SVM();
                                        model.LearnModel();
                                        var watch = Stopwatch.StartNew();
                                        runres = model.RunOnTestSet();
                                        watch.Stop();
                                        var elapsedMs = watch.ElapsedMilliseconds;
                                        totalTime = totalTime + elapsedMs;
                                    }
                                }
                            }
                        }
                        // Selected nodes that are test targets (used for recall) and that are in the precision target set (used for precision).
                        HashSet <HtmlNode> spos = new HashSet <HtmlNode>(DomPool.TESTTargetNodes.Intersect(runres));
                        HashSet <HtmlNode> sposprecision = new HashSet <HtmlNode>(DomPool.TESTTargetNodesPrecision.Intersect(runres));
                        foreach (var entry in DomPool.docsAndNames)
                        {
                            // Documents used for training in this iteration are not scored.
                            if (DomPool.trainingDocsNames.Contains(entry.Key))
                            {
                                continue;
                            }
                            HashSet <HtmlNode> docNodes = new HashSet <HtmlNode>(entry.Value.SelectNodes("//*"));
                            HashSet <HtmlNode> currspos = new HashSet <HtmlNode>(spos.Intersect(docNodes));
                            HashSet <HtmlNode> currrunres = new HashSet <HtmlNode>(runres.Intersect(docNodes));
                            HashSet <HtmlNode> currsposprecision = new HashSet <HtmlNode>(sposprecision.Intersect(docNodes));
                            HashSet <HtmlNode> currTargetNodes = new HashSet <HtmlNode>(DomPool.TESTTargetNodes.Intersect(docNodes));
                            // Floating-point division: an empty denominator yields NaN or Infinity rather than throwing.
                            double currSiteAccuracy = (currsposprecision.Count() / ((double)currrunres.Count()));
                            double currSiteRecall = (currspos.Count() / ((double)currTargetNodes.Count()));
                            // A site where nothing was selected adds 0 to both sums but still counts as a test below.
                            if (((double)currrunres.Count()) > 0)
                            {
                                SiteTotalPrecision[entry.Key] = SiteTotalPrecision[entry.Key] + currSiteAccuracy;
                                SiteTotalRecall[entry.Key] = SiteTotalRecall[entry.Key] + currSiteRecall;
                            }
                            SiteTotalTests[entry.Key] = SiteTotalTests[entry.Key] + 1;
                        }
                        double currAccuracy = (sposprecision.Count() / ((double)runres.Count()));
                        double currRecall = (spos.Count() / ((double)DomPool.TESTTargetNodes.Count()));
                        if (runres.Count() > 0)
                        {
                            totalAccuracy = totalAccuracy + currAccuracy;
                            totalRecall = totalRecall + currRecall;
                        }
                    }
                    // Averages over all subsets; computed here but not printed below (the printed totals are the per-site averages).
                    totalAccuracy = totalAccuracy / subsetsIndexes.Count();
                    totalRecall = totalRecall / subsetsIndexes.Count();
                    Console.WriteLine("########## Results " + tools[tool] + " for i=" + i + "##########");
                    Console.WriteLine("+++++++++ Detailed Results for i=" + i + "++++++++++#");
                    double count = 0;
                    // totalSumPrecision and totalSumRecall are declared but never used in this method.
                    double totalSumPrecision = 0;
                    double totalSumRecall = 0;
                    double avgRecall = 0;
                    double avgPrecision = 0;
                    double avgFscore = 0;
                    // Number of sites that were tested at least once; the divisor for the average precision.
                    double numPrecision = 0;
                    foreach (string site in DomPool.allDocsNames)
                    {
                        // A site that was never tested gets a test count of 1 so the divisions below are by 1, not 0.
                        if (SiteTotalTests[site] < 1)
                        {
                            SiteTotalTests[site]++;
                        }
                        else
                        {
                            numPrecision++;
                        }
                        double sitePrecision = SiteTotalPrecision[site] / SiteTotalTests[site];
                        double siteRecall = SiteTotalRecall[site] / SiteTotalTests[site];
                        double siteFscore = 2 * (sitePrecision * siteRecall) / (sitePrecision + siteRecall);
                        // 0/0 above gives NaN; define the F-score as 0 in that case.
                        if (siteRecall == 0 && sitePrecision == 0)
                        {
                            siteFscore = 0;
                        }
                        count++;
                        avgRecall = avgRecall + siteRecall;
                        avgPrecision = avgPrecision + sitePrecision;
                        avgFscore = avgFscore + siteFscore;
                        Console.WriteLine(">" + site + ": Precision:" + sitePrecision + " , Recall:" + siteRecall + ", F-score:" + siteFscore);
                    }
                    Console.WriteLine("++++++++++++++++Total+++++++++++++++++");
                    // Recall and F-score are averaged over all sites; precision only over the sites that were tested.
                    avgRecall = avgRecall / count;
                    avgPrecision = avgPrecision / numPrecision;
                    avgFscore = avgFscore / count;
                    Console.WriteLine("Recall:" + avgRecall);
                    Console.WriteLine("Precision:" + avgPrecision);
                    Console.WriteLine("F-score:" + avgFscore);
                    Console.WriteLine("Time:" + totalTime);
                }
            }
        }
    }
    // Blocks until a line is read from standard input.
    Console.ReadLine();
}
/// <summary>
/// Tries to split <paramref name="dn"/> on the selector feature with the best
/// cost-adjusted information gain. When such a feature exists, the node receives a
/// <c>SetSelected</c> child (nodes matching the feature) and a <c>SetNotSelected</c>
/// child (the remaining nodes), and the feature is added to the node's own
/// <c>FeatureSet</c>. When no feature qualifies the node is left untouched.
/// </summary>
/// <param name="dn">The decision node to split.</param>
/// <param name="level">Depth of <paramref name="dn"/>; deeper levels damp the balance fix and the cost penalty.</param>
/// <exception cref="DivideByZeroException">
/// Thrown when <c>DomPool.TargetNodes</c> is empty (integer division in the balance fix).
/// </exception>
public static void ImproveTree(DecisionNode dn, int level)
{
    double maxScore = 0;
    Feature maxGainFeature = null;
    HashSet<HtmlNode> newFeatureSelected = null;
    Object lockObj = new object();

    // Down-weights negatives to compensate for class imbalance; never below 1.
    // NOTE(review): the class ratio uses integer division, so it is truncated — confirm this is intended.
    double balanceFix = Math.Max(1, (Math.Pow(0.3, Math.Sqrt(level + 1))) * (DomPool.NonTargetNodes.Count() / DomPool.TargetNodes.Count()));
    double dnEntropy = dn.CalculateEntropy(1, balanceFix);

    Parallel.ForEach(DomPool.SelectorFeatures, (currCandidate) =>
    {
        string currFeatureXpath = XpathTools.FeatureSetToXpath(new HashSet<Feature>() { currCandidate });
        HashSet<HtmlNode> currFeatureXpathSelected = DomPool.RunXpathQuery(currFeatureXpath);

        // Partition this node's set into the nodes the candidate selects and the rest.
        HashSet<HtmlNode> xpathCurrSelected = new HashSet<HtmlNode>(dn.InitialNodeSet.Intersect(currFeatureXpathSelected));
        HashSet<HtmlNode> xpathCurrNotSelected = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(xpathCurrSelected));

        //calculate information gain
        HashSet<HtmlNode> currSelectedPositive = new HashSet<HtmlNode>(xpathCurrSelected.Intersect(dn.SelectedPositive));
        HashSet<HtmlNode> currSelectedNegative = new HashSet<HtmlNode>(xpathCurrSelected.Intersect(dn.SelectedNegative));
        HashSet<HtmlNode> currNotSelectedPositive = new HashSet<HtmlNode>(xpathCurrNotSelected.Intersect(dn.SelectedPositive));

        // These are floating-point divisions: an empty partition yields NaN, and a NaN
        // sp fails the "sp > nsp" test below, so such a candidate is never chosen.
        double sp = ((double)currSelectedPositive.Count()) / xpathCurrSelected.Count();
        double sn = ((double)currSelectedNegative.Count()) / xpathCurrSelected.Count();
        double relativeRecall = ((double)currSelectedPositive.Count()) / ((double)dn.SelectedPositive.Count());
        sn = sn / balanceFix;
        double selectedProbability = ((double)xpathCurrSelected.Count()) / dn.InitialNodeSet.Count();
        double selectedEntropy = Statistics.CalculateEntropy(sp, sn);

        double nsp = ((double)currNotSelectedPositive.Count()) / xpathCurrNotSelected.Count();
        double nsn = 1 - nsp;
        // Apply the same balance fix to the not-selected side.
        nsn = nsn / balanceFix;
        double notselectedProbability = 1 - selectedProbability;
        double notSelectedEntropy = Statistics.CalculateEntropy(nsp, nsn);

        // Re-normalise the branch probabilities after the balance fix.
        double sumTemp = (selectedProbability * sn + selectedProbability * sp * balanceFix + notselectedProbability * nsn + notselectedProbability * nsp * balanceFix);
        selectedProbability = (selectedProbability * sn + selectedProbability * sp * balanceFix) / sumTemp;
        notselectedProbability = (notselectedProbability * nsn + notselectedProbability * nsp * balanceFix) / sumTemp;

        double gain = dnEntropy - ((selectedProbability * selectedEntropy) + (notselectedProbability * notSelectedEntropy));

        //Choose the most cost effective feature
        gain = gain / (currCandidate.cost + (((1 - relativeRecall) + (1 - ((double)DomPool.FeatureFrequencey[currCandidate.feature.First().ToLower()]) / DomPool.trainingDocsNames.Count))) * Math.Pow(0.3, level));

        // The running maximum is shared between worker threads.
        lock (lockObj)
        {
            if (gain > maxScore && sp > nsp)
            {
                maxScore = gain;
                maxGainFeature = currCandidate;
                newFeatureSelected = xpathCurrSelected;
            }
        }
    });

    if (maxGainFeature == null)
    {
        return;
    }

    // "Selected" child: nodes matched by the winning feature.
    dn.SetSelected = new DecisionNode();
    dn.SetSelected.InitialNodeSet = newFeatureSelected;
    dn.SetSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetSelected.FeatureSet.Add(maxGainFeature);
    dn.SetSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetSelected.CalculateEntropy();

    // "Not selected" child: everything else.
    dn.SetNotSelected = new DecisionNode();
    dn.SetNotSelected.InitialNodeSet = new HashSet<HtmlNode>(dn.InitialNodeSet.Except(newFeatureSelected));
    //FIX FOR NOT BRANCH, INSTEAD OF HAVING THE NOT.
    if (FixEnabledForNotBranch)
    {
        dn.SetNotSelected.InitialNodeSet.UnionWith(dn.SetSelected.SelectedNegative);
    }
    // Copied before the winning feature is added to dn.FeatureSet, so this branch does not carry it.
    dn.SetNotSelected.FeatureSet = new HashSet<Feature>(dn.FeatureSet);
    dn.SetNotSelected.SelectedNegative = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedNegative));
    dn.SetNotSelected.SelectedPositive = new HashSet<HtmlNode>(dn.SetNotSelected.InitialNodeSet.Intersect(dn.SelectedPositive));
    dn.SetNotSelected.CalculateEntropy();

    dn.FeatureSet.Add(maxGainFeature);
}
/// <summary>
/// Console entry point. Reads one command and dispatches on it (case-insensitive, trimmed):
/// "huge" runs <c>TestSites.TestAllSites</c>; "t" starts an interactive XPath query loop;
/// "r" learns and prints an XPath from the training files; "s", "archive" and any other
/// input run a test suite with console output redirected to a file; "a" parses a results file.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    Console.WriteLine("T for test, R for Run, S for seen overall testing and O for overall testing:");
    string res = ReadLine();
    if (res == null)
    {
        // End of input: no command was given.
        return;
    }

    string command = res.ToLower().Trim();

    if (command.Equals("huge"))
    {
        TestSites.TestAllSites();
        return;
    }

    if (command.Equals("t"))
    {
        RunInteractiveQueryLoop();
    }
    else if (command.Equals("r"))
    {
        Console.WriteLine(LearnXpathWrapper.LearnXpathFromTrainingFiles(FILES_LOCATION));
        Console.ReadLine();
    }
    else if (command.Equals("s"))
    {
        RunWithOutputRedirectedTo("resultsSeen.txt", () => OverallSeenTesting.RunTest(FILES_LOCATION));
    }
    else if (command.Equals("archive"))
    {
        RunWithOutputRedirectedTo("archive2-results.txt", () => OverallArchive2Testing.RunTest(ARCHIVE_FILES_LOCATION));
    }
    else if (command.Equals("a"))
    {
        Console.WriteLine("Please enter file name to parse:");
        string fnp = ReadLine();
        if (fnp == null)
        {
            return;
        }
        fnp = fnp.Trim();
        parseres.learn(fnp);
        parseres.save("parsed" + fnp);
    }
    else
    {
        RunWithOutputRedirectedTo("results.txt", () => OverallTesting.RunTest(FILES_LOCATION));
    }
}

/// <summary>
/// Loads the training documents, then repeatedly reads an XPath query and prints
/// per-document and overall accuracy/recall, until "exit" or end of input.
/// </summary>
private static void RunInteractiveQueryLoop()
{
    DomPool.LoadDocuments(FILES_LOCATION);
    DomPool.Initiate();
    Console.WriteLine("insert query:");
    string q = ReadLine();
    while (q != null && !q.Equals("exit"))
    {
        var runres = DomPool.RunXpathQuery(q);
        if (runres != null)
        {
            Console.WriteLine("result size" + runres.Count());
            HashSet<HtmlNode> spos = new HashSet<HtmlNode>(DomPool.TargetNodes.Intersect(runres));
            HashSet<HtmlNode> sposprecision = new HashSet<HtmlNode>(DomPool.TargetNodesPrecision.Intersect(runres));
            foreach (var entry in DomPool.docsAndNames)
            {
                HashSet<HtmlNode> docNodes = new HashSet<HtmlNode>(entry.Value.SelectNodes("//*"));
                HashSet<HtmlNode> currspos = new HashSet<HtmlNode>(spos.Intersect(docNodes));
                HashSet<HtmlNode> currrunres = new HashSet<HtmlNode>(runres.Intersect(docNodes));
                HashSet<HtmlNode> currsposprecision = new HashSet<HtmlNode>(sposprecision.Intersect(docNodes));
                HashSet<HtmlNode> currTargetNodes = new HashSet<HtmlNode>(DomPool.TargetNodes.Intersect(docNodes));
                // Floating-point division: an empty denominator prints NaN/Infinity instead of throwing.
                Console.WriteLine(entry.Key + "-Accuracy:" + (currsposprecision.Count() / ((double)currrunres.Count())) + ". Recall:" + (currspos.Count() / ((double)currTargetNodes.Count())) + "");
            }
            Console.WriteLine("Accuracy:" + (sposprecision.Count() / ((double)runres.Count())) + ". Recall:" + (spos.Count() / ((double)DomPool.TargetNodes.Count())) + "");
        }
        else
        {
            Console.WriteLine("null");
        }
        Console.WriteLine("insert query:");
        q = ReadLine();
    }
}

/// <summary>
/// Announces the redirect, runs <paramref name="run"/> with <c>Console.Out</c> writing to a
/// freshly created <paramref name="fileName"/>, then restores the previous console writer and
/// closes the file — also when <paramref name="run"/> throws.
/// </summary>
private static void RunWithOutputRedirectedTo(string fileName, Action run)
{
    Console.WriteLine("Output is redirected to " + fileName + " in the debug dir");
    TextWriter previousOut = Console.Out;
    using (FileStream fs = new FileStream(fileName, FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        Console.SetOut(sw);
        try
        {
            run();
        }
        finally
        {
            // Restore first so nothing can write to the stream after it is disposed.
            Console.SetOut(previousOut);
            sw.Flush();
        }
    }
}
/// <summary>
/// For every site: crawls its start URLs to collect page links per attribute, uses the
/// first five links of each attribute as training pages (marking ground-truth nodes with
/// <c>userselected="yes"</c> and caching the pages under <c>huge/&lt;site&gt;/training/&lt;attr&gt;</c>),
/// learns a forgiving XPath and an alignment XPath, and evaluates both on the remaining
/// links via <c>CheckOnTest</c>. Waits for a line on standard input when done.
/// </summary>
/// <param name="siteinfos">Sites to process.</param>
public static void PorcessSites(List<SiteInfo> siteinfos)
{
    foreach (SiteInfo si in siteinfos)
    {
        try
        {
            // Budget of links to collect: pagesNum per attribute.
            int leftPages = pagesNum * si.attributeExtraction.Keys.Count();
            if (!SiteDocuments.ContainsKey(si.SiteName))
            {
                SiteDocuments[si.SiteName] = new Dictionary<string, Dictionary<String, HtmlNode>>();
                SiteLinks[si.SiteName] = new Dictionary<string, HashSet<String>>();
                foreach (String attrName in si.attributeExtraction.Keys)
                {
                    SiteDocuments[si.SiteName][attrName] = new Dictionary<string, HtmlNode>();
                    SiteLinks[si.SiteName][attrName] = new HashSet<string>();
                }
            }

            //Download all URLs
            foreach (String startURL in si.StartURLs)
            {
                try
                {
                    String currURL = startURL;
                    while (currURL != null)
                    {
                        HtmlDocument doc = new HtmlDocument();
                        var currHTML = URLDownloader.GetHtmlOfURL(currURL);
                        doc.LoadHtml(currHTML);

                        if (!String.IsNullOrEmpty(si.PageExtractionXpath))
                        {
                            var links = doc.DocumentNode.SelectNodes(si.PageExtractionXpath);
                            if (links == null)
                            {
                                // SelectNodes yields null when nothing matches: stop crawling this start URL.
                                break;
                            }

                            foreach (HtmlNode lnk in links)
                            {
                                try
                                {
                                    String pageLink = lnk.Attributes["href"].Value;
                                    foreach (string attr in si.attributeExtraction.Keys)
                                    {
                                        var newURL = URLDownloader.UrlFixIfRelative(pageLink, currURL);
                                        if (SiteLinks[si.SiteName][attr].Contains(newURL))
                                        {
                                            continue;
                                        }
                                        SiteLinks[si.SiteName][attr].Add(newURL);
                                        if (--leftPages <= 0)
                                        {
                                            break;
                                        }
                                    }
                                    if (leftPages <= 0)
                                    {
                                        break;
                                    }
                                }
                                catch (Exception)
                                {
                                    // Best effort: a link without a usable href is skipped.
                                }
                            }
                        }
                        else
                        {
                            // No link-extraction XPath: the listing page itself is the page to use.
                            foreach (string attr in si.attributeExtraction.Keys)
                            {
                                try
                                {
                                    if (SiteLinks[si.SiteName][attr].Contains(currURL))
                                    {
                                        continue;
                                    }
                                    SiteLinks[si.SiteName][attr].Add(currURL);
                                    if (--leftPages <= 0)
                                    {
                                        break;
                                    }
                                }
                                catch
                                {
                                    break;
                                }
                            }
                        }

                        if (leftPages <= 0)
                        {
                            break;
                        }

                        //get next page
                        String nextLink = null;
                        try
                        {
                            nextLink = HttpUtility.HtmlDecode(doc.DocumentNode.SelectSingleNode(si.NextPageXPath).Attributes["href"].Value);
                        }
                        catch
                        {
                            // No next-page link on this page.
                            nextLink = null;
                        }

                        if (nextLink == null)
                        {
                            break;
                        }
                        nextLink = URLDownloader.UrlFixIfRelative(nextLink, currURL);

                        // Stop when pagination is exhausted or points back at the current page.
                        if (nextLink == null || currURL.ToLower().Trim().Equals(nextLink.ToLower().Trim()))
                        {
                            break;
                        }
                        currURL = nextLink;
                    }
                }
                catch
                {
                    // Best effort: a failing start URL (e.g. download error) is abandoned
                    // and the next one is tried.
                }
            }

            foreach (String attr in si.attributeExtraction.Keys)
            {
                // The first five collected links are the training pages; the rest are test pages.
                var trainingkeys = new HashSet<String>(SiteLinks[si.SiteName][attr].Take(5));
                var trainingDic = new Dictionary<String, HtmlNode>();
                foreach (String lnk in trainingkeys)
                {
                    trainingDic.Add(lnk, GetHtmlNode(lnk));
                }

                foreach (var lnk in trainingDic.Keys)
                {
                    HtmlNode adoc = trainingDic[lnk];
                    try
                    {
                        // Mark the ground-truth nodes so the learner can find them.
                        var gt = adoc.SelectNodes(si.attributeExtraction[attr]);
                        if (gt != null)
                        {
                            foreach (var targetNode in gt)
                            {
                                if (targetNode.Attributes.Contains("userselected"))
                                {
                                    targetNode.SetAttributeValue("userselected", "yes");
                                }
                                else
                                {
                                    targetNode.Attributes.Add("userselected", "yes");
                                }
                            }
                        }
                    }
                    catch
                    {
                        // Best effort: a page that cannot be annotated is still saved and used.
                    }

                    // Cache the annotated training page on disk, once per link.
                    string trainingDir = "huge/" + si.SiteName + "/training/" + attr;
                    string trainingFile = trainingDir + "/" + getMD5(lnk) + ".html";
                    if (!File.Exists(trainingFile))
                    {
                        Directory.CreateDirectory(trainingDir);
                        File.WriteAllText(trainingFile, adoc.InnerHtml);
                    }
                }

                DomPool.LoadDocuments(trainingDic);
                DomPool.Initiate(new HashSet<string>(trainingDic.Keys));
                DomPool.ExtractAllFeatures();

                // Run code
                DecisionNode dn = new DecisionNode();
                dn.InitialNodeSet = new HashSet<HtmlNode>(DomPool.TargetNodes.Union(DomPool.NonTargetNodes));
                dn.SelectedNegative = new HashSet<HtmlNode>(DomPool.NonTargetNodes.Except(DomPool.TargetNodesPrecision));
                dn.SelectedPositive = new HashSet<HtmlNode>(DomPool.TargetNodes);
                dn.FeatureSet = new HashSet<Feature>();
                dn.CalculateEntropy();
                DecisionTreeLearning.RecursiveTreeImprovement(dn);
                var xpath = XpathTools.GenerateAForgivingXpath(dn);

                // NOTE(review): the non-forgiving XPath is built but never passed to CheckOnTest;
                // the call is kept in case DecisionTreeToXpath has side effects — confirm and remove.
                var xpathNonForgiving = XpathTools.DecisionTreeToXpath(dn, new HashSet<Feature>(), 1);
                xpathNonForgiving = "//*" + (xpathNonForgiving.Equals("") ? "" : ("[" + xpathNonForgiving + "]"));

                XpathAlignment model = new XpathAlignment();
                model.LearnModel();
                var alignmentXpath = model.xpath;

                CheckOnTest(new HashSet<string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), xpath, si.attributeExtraction[attr], si.SiteName, attr, "ForgivingXP");
                CheckOnTest(new HashSet<string>(SiteLinks[si.SiteName][attr].Except(trainingkeys)), alignmentXpath, si.attributeExtraction[attr], si.SiteName, attr, "Alignment");
            }
        }
        finally
        {
            // Release the site's documents even when processing fails.
            SiteDocuments.Remove(si.SiteName);
        }
    }
    Console.ReadLine();
}
/// <summary>
/// Trains a Weka J48 decision tree on the nodes in <c>allNodes</c>. Each selector
/// feature becomes a 0/1 attribute ("the feature's query selects this node") and the
/// class is "yes" for nodes in <c>DomPool.TargetNodes</c>, "no" otherwise. Target
/// instances are duplicated to offset class imbalance. Stores the tree in
/// <c>classifierTree</c> and the features the tree uses in <c>FeaturesUsed</c>.
/// </summary>
public void LearnModel()
{
    Init();

    // Record, for each known node, which selector features select it.
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (allNodes.Contains(nd))
            {
                nodeFeatures[nd].Add(featureString);
            }
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 100);
    // The last attribute is the class attribute.
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    // Each target node is added this many times. The integer ratio is 0 when there are
    // fewer non-targets than targets, which used to drop every positive example; clamp to 1.
    int targetCount = DomPool.TargetNodes.Count();
    int positiveCopies = targetCount == 0 ? 0 : Math.Max(1, DomPool.NonTargetNodes.Count() / targetCount);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            if (nodeFeatures[currNode].Contains(currFeature.name()))
            {
                item.setValue(currFeature, 1);
            }
            else
            {
                item.setValue(currFeature, 0);
            }
        }

        //set the class
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        bool isTarget = DomPool.TargetNodes.Contains(currNode);
        item.setValue(classFeature, (isTarget ? "yes" : "no"));
        item.setDataset(trainingSet);

        if (isTarget)
        {
            for (int t = 0; t < positiveCopies; t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // "-C 0.1": J48 pruning confidence factor (the tree is pruned; "-U" would disable pruning).
    String[] options = new String[2];
    options[0] = "-C";
    options[1] = "0.1";
    J48 tree = new J48();
    tree.setOptions(options);
    tree.buildClassifier(trainingSet);

    //save the resulting classifier
    classifierTree = tree;

    // Parse the tree's graph output to find out which features it actually uses.
    Reader treeDot = new StringReader(tree.graph());
    TreeBuild treeBuild = new TreeBuild();
    Node treeRoot = treeBuild.create(treeDot);
    FeaturesUsed = getTreeFeatures(treeRoot);
}
/// <summary>
/// Trains a Weka NaiveBayes classifier on the nodes in <c>allNodes</c>. Each selector
/// feature becomes a 0/1 attribute ("the feature's query selects this node") and the
/// class is "yes" for nodes in <c>DomPool.TargetNodes</c>, "no" otherwise. Target
/// instances are duplicated to offset class imbalance. Stores the model in
/// <c>classifier</c> and the string form of every selector feature in <c>FeaturesUsed</c>.
/// </summary>
public void LearnModel()
{
    Init();

    // Record, for each known node, which selector features select it.
    foreach (Feature currFeature in DomPool.SelectorFeatures)
    {
        String featureString = currFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            if (allNodes.Contains(nd))
            {
                nodeFeatures[nd].Add(featureString);
            }
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 10);
    // The last attribute is the class attribute.
    trainingSet.setClassIndex(fvWekaAttributes.size() - 1);

    // Each target node is added this many times. The integer ratio is 0 when there are
    // fewer non-targets than targets, which used to drop every positive example; clamp to 1.
    int targetCount = DomPool.TargetNodes.Count();
    int positiveCopies = targetCount == 0 ? 0 : Math.Max(1, DomPool.NonTargetNodes.Count() / targetCount);

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());
        for (int i = 0; i < fvWekaAttributes.size() - 1; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            if (nodeFeatures[currNode].Contains(currFeature.name()))
            {
                item.setValue(currFeature, 1);
            }
            else
            {
                item.setValue(currFeature, 0);
            }
        }

        //set the class
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(fvWekaAttributes.size() - 1);
        bool isTarget = DomPool.TargetNodes.Contains(currNode);
        item.setValue(classFeature, (isTarget ? "yes" : "no"));
        item.setDataset(trainingSet);

        if (isTarget)
        {
            for (int t = 0; t < positiveCopies; t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // NaiveBayes is trained with its default options.
    NaiveBayes cls = new NaiveBayes();
    cls.buildClassifier(trainingSet);

    //save the resulting classifier
    classifier = cls;

    // Unlike a tree, this model has no feature selection to read back: every selector feature counts as used.
    FeaturesUsed = new HashSet<string>();
    foreach (Feature f in DomPool.SelectorFeatures)
    {
        FeaturesUsed.Add(f.ToString());
    }
}
/// <summary>
/// Runs this model's <c>xpath</c> through <c>DomPool.TESTSeenRunXpathQuery</c>.
/// </summary>
/// <returns>The nodes returned by the query.</returns>
public HashSet<HtmlNode> RunOnTestSeenSet()
{
    HashSet<HtmlNode> selectedNodes = DomPool.TESTSeenRunXpathQuery(xpath);
    return selectedNodes;
}
/// <summary>
/// Shorthand for obtaining a DOM object from a selector string.
/// </summary>
/// <param name="selector">DOM selector string, passed unchanged to <c>DomPool.GetDom</c>.</param>
/// <returns>The <see cref="ISyncDom"/> returned by <c>DomPool.GetDom</c>.</returns>
public static ISyncDom _(string selector)
{
    return DomPool.GetDom(selector);
}