/// <summary>
/// Classifies every node in <c>testSeenAllNodes</c> with <c>classifierTree</c> and
/// gathers the nodes whose predicted class label is "yes".
/// </summary>
/// <remarks>
/// Calls <c>InitTestSeen()</c> first, then fills <c>testSeenNodeFeatures</c> by running
/// one query per entry of <c>FeaturesUsed</c>.
/// </remarks>
/// <returns>The nodes the classifier labelled "yes".</returns>
public HashSet<HtmlNode> RunOnTestSeenSet()
{
    InitTestSeen();

    // Record, for each known node, which of the used features match it.
    foreach (string usedFeature in FeaturesUsed)
    {
        foreach (HtmlNode matched in DomPool.TESTSeenRunXpathQuery(useNormalPerformanceQUERY(usedFeature)))
        {
            // Query results outside the tracked node set are ignored.
            if (testSeenAllNodes.Contains(matched))
            {
                testSeenNodeFeatures[matched].Add(usedFeature);
            }
        }
    }

    FastVector attributes = GetDataSetAtts();
    int attributeCount = attributes.size();
    int classIndex = attributeCount - 1; // the class attribute is the last one

    Instances testSet = new Instances("TestSeenSet", attributes, 10);
    testSet.setClassIndex(classIndex);

    HashSet<HtmlNode> selectedNodes = new HashSet<HtmlNode>();
    foreach (HtmlNode node in testSeenAllNodes)
    {
        Instance row = new SparseInstance(attributeCount);

        // Every non-class attribute is a 0/1 flag: does this feature match the node?
        for (int a = 0; a < classIndex; a++)
        {
            weka.core.Attribute feature = (weka.core.Attribute)attributes.elementAt(a);
            bool matches = testSeenNodeFeatures[node].Contains(feature.name());
            row.setValue(feature, matches ? 1 : 0);
        }

        // The class value is left unset; it is what the classifier predicts.
        weka.core.Attribute classAttribute = (weka.core.Attribute)attributes.elementAt(classIndex);
        row.setDataset(testSet);

        double predictedIndex = classifierTree.classifyInstance(row);
        string predictedLabel = classAttribute.value((int)predictedIndex);
        if (predictedLabel.Equals("yes"))
        {
            selectedNodes.Add(node);
        }

        testSet.add(row);
    }

    return selectedNodes;
}
/// <summary>
/// Builds a single WHO-candidate instance from <paramref name="candidate"/> and the
/// tokens surrounding it in <c>articleCurrent</c>.
/// </summary>
/// <remarks>
/// Attribute layout (indices into <paramref name="fvWho"/>): 0 value, 1 number of
/// space-separated parts of the value, 2 sentence, 3 position, 4 sentence-start
/// proximity, 5 frequency; then <c>whoWordsBefore</c> preceding-word slots,
/// <c>whoWordsAfter</c> following-word slots, <c>whoWordsBefore</c> preceding-POS slots
/// and <c>whoWordsAfter</c> following-POS slots. One extra trailing slot is allocated
/// but never assigned here (presumably the class attribute - confirm against callers).
/// </remarks>
/// <param name="fvWho">Vector of attributes, in the layout described above.</param>
/// <param name="candidate">Token the instance describes.</param>
/// <returns>The populated instance; slots with no corresponding token are not assigned.</returns>
private Instance createSingleWhoInstance(FastVector fvWho, Token candidate)
{
    const int fixedAttributeCount = 6;

    // Start offsets of the four context-window regions.
    int wordsBeforeBase = fixedAttributeCount;
    int wordsAfterBase = wordsBeforeBase + whoWordsBefore;
    int posBeforeBase = wordsBeforeBase + whoWordsBefore + whoWordsAfter;
    int posAfterBase = posBeforeBase + whoWordsBefore;

    Instance instance = new DenseInstance(fixedAttributeCount + 2 * (whoWordsBefore + whoWordsAfter) + 1);

    instance.setValue((weka.core.Attribute)fvWho.elementAt(0), candidate.Value);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(1), candidate.Value.Split(' ').Length);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(2), candidate.Sentence);
    instance.setValue((weka.core.Attribute)fvWho.elementAt(3), candidate.Position);

    // Offset of the candidate from the first token of its sentence, divided by the
    // sentence's token count; stays at -1 when no matching sentence is found.
    double startProximity = -1;
    foreach (List<Token> sentenceTokens in segregatedArticleCurrent)
    {
        if (sentenceTokens.Count == 0 || sentenceTokens[0].Sentence != candidate.Sentence)
        {
            continue;
        }

        startProximity = (double)(candidate.Position - sentenceTokens[0].Position) / (double)sentenceTokens.Count;
        break;
    }

    if (startProximity > -1)
    {
        instance.setValue((weka.core.Attribute)fvWho.elementAt(4), startProximity);
    }

    instance.setValue((weka.core.Attribute)fvWho.elementAt(5), candidate.Frequency);

    // Preceding context: slot 0 holds the farthest token, the last slot the nearest.
    // NOTE(review): the "- 1" suggests Position is 1-based relative to articleCurrent - confirm.
    for (int slot = 0; slot < whoWordsBefore; slot++)
    {
        int tokenIndex = candidate.Position - (whoWordsBefore - slot) - 1;
        if (tokenIndex < 0)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWho.elementAt(wordsBeforeBase + slot), neighbour.Value);
        if (neighbour.PartOfSpeech != null)
        {
            instance.setValue((weka.core.Attribute)fvWho.elementAt(posBeforeBase + slot), neighbour.PartOfSpeech);
        }
    }

    // Following context: slot 0 reads articleCurrent[Position], and so on forward.
    for (int slot = 0; slot < whoWordsAfter; slot++)
    {
        int tokenIndex = candidate.Position + slot;
        if (tokenIndex >= articleCurrent.Count)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWho.elementAt(wordsAfterBase + slot), neighbour.Value);
        if (neighbour.PartOfSpeech != null)
        {
            instance.setValue((weka.core.Attribute)fvWho.elementAt(posAfterBase + slot), neighbour.PartOfSpeech);
        }
    }

    return instance;
}
/// <summary>
/// Builds a single WHY-candidate instance from <paramref name="candidate"/> and the
/// tokens surrounding it in <c>articleCurrent</c>.
/// </summary>
/// <remarks>
/// Attribute layout (indices into <paramref name="fvWhy"/>): 0 value, 1 number of
/// space-separated parts of the value, 2 sentence, 3 score, 4 NumWho, 5 NumWhen,
/// 6 NumWhere; then <c>whyWordsBefore</c> preceding-word slots, <c>whyWordsAfter</c>
/// following-word slots, <c>whyWordsBefore</c> preceding-POS slots and
/// <c>whyWordsAfter</c> following-POS slots. One extra trailing slot is allocated but
/// never assigned here (presumably the class attribute - confirm against callers).
/// </remarks>
/// <param name="fvWhy">Vector of attributes, in the layout described above.</param>
/// <param name="candidate">Token the instance describes.</param>
/// <returns>The populated instance; slots with no corresponding token are not assigned.</returns>
private Instance createSingleWhyInstance(FastVector fvWhy, Token candidate)
{
    const int fixedAttributeCount = 7;

    // Start offsets of the four context-window regions.
    int wordsBeforeBase = fixedAttributeCount;
    int wordsAfterBase = wordsBeforeBase + whyWordsBefore;
    int posBeforeBase = wordsBeforeBase + whyWordsBefore + whyWordsAfter;
    int posAfterBase = posBeforeBase + whyWordsBefore;

    Instance instance = new DenseInstance(fixedAttributeCount + 2 * (whyWordsBefore + whyWordsAfter) + 1);

    instance.setValue((weka.core.Attribute)fvWhy.elementAt(0), candidate.Value);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(1), candidate.Value.Split(' ').Length);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(2), candidate.Sentence);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(3), candidate.Score);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(4), candidate.NumWho);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(5), candidate.NumWhen);
    instance.setValue((weka.core.Attribute)fvWhy.elementAt(6), candidate.NumWhere);

    // Preceding context: slot 0 holds the farthest token, the last slot the nearest.
    // NOTE(review): the "- 1" suggests Position is 1-based relative to articleCurrent - confirm.
    for (int slot = 0; slot < whyWordsBefore; slot++)
    {
        int tokenIndex = candidate.Position - (whyWordsBefore - slot) - 1;
        if (tokenIndex < 0)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWhy.elementAt(wordsBeforeBase + slot), neighbour.Value);
        if (neighbour.PartOfSpeech != null)
        {
            instance.setValue((weka.core.Attribute)fvWhy.elementAt(posBeforeBase + slot), neighbour.PartOfSpeech);
        }
    }

    // Following context: slot 0 reads articleCurrent[Position], and so on forward.
    for (int slot = 0; slot < whyWordsAfter; slot++)
    {
        int tokenIndex = candidate.Position + slot;
        if (tokenIndex >= articleCurrent.Count)
        {
            continue;
        }

        var neighbour = articleCurrent[tokenIndex];
        instance.setValue((weka.core.Attribute)fvWhy.elementAt(wordsAfterBase + slot), neighbour.Value);
        if (neighbour.PartOfSpeech != null)
        {
            instance.setValue((weka.core.Attribute)fvWhy.elementAt(posAfterBase + slot), neighbour.PartOfSpeech);
        }
    }

    return instance;
}
/// <summary>
/// Trains a J48 decision tree that separates <c>DomPool.TargetNodes</c> from the other
/// nodes in <c>allNodes</c>, using one 0/1 attribute per selector feature, and stores
/// the result in <c>classifierTree</c> and <c>FeaturesUsed</c>.
/// </summary>
/// <remarks>
/// Target nodes are oversampled: each is added
/// <c>NonTargetNodes.Count() / TargetNodes.Count()</c> times (integer division), but
/// always at least once so that positive examples are never dropped when targets
/// outnumber non-targets.
/// </remarks>
public void LearnModel()
{
    Init();

    // Record, for each known node, which selector features match it.
    foreach (Feature selectorFeature in DomPool.SelectorFeatures)
    {
        String featureString = selectorFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            // Query results outside the tracked node set are ignored.
            if (allNodes.Contains(nd))
            {
                nodeFeatures[nd].Add(featureString);
            }
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    int classIndex = fvWekaAttributes.size() - 1; // the class attribute is the last one
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 100);
    trainingSet.setClassIndex(classIndex);

    // Oversampling factor for target nodes. The integer division yields 0 when targets
    // outnumber non-targets, which used to remove every positive example from the
    // training set; clamp it to at least one copy.
    int targetCount = DomPool.TargetNodes.Count();
    int targetCopies = 1;
    if (targetCount > 0)
    {
        targetCopies = DomPool.NonTargetNodes.Count() / targetCount;
        if (targetCopies < 1)
        {
            targetCopies = 1;
        }
    }

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());

        // Every non-class attribute is a 0/1 flag: does this feature match the node?
        for (int i = 0; i < classIndex; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            item.setValue(currFeature, nodeFeatures[currNode].Contains(currFeature.name()) ? 1 : 0);
        }

        // Set the class label.
        bool isTarget = DomPool.TargetNodes.Contains(currNode);
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);
        item.setValue(classFeature, isTarget ? "yes" : "no");
        item.setDataset(trainingSet);

        if (isTarget)
        {
            for (int t = 0; t < targetCopies; t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // "-C 0.1" sets J48's pruning confidence factor; the tree is still pruned
    // (an unpruned tree would need "-U").
    String[] options = new String[] { "-C", "0.1" };

    J48 tree = new J48();
    tree.setOptions(options);
    tree.buildClassifier(trainingSet);

    // Save the resulting classifier.
    classifierTree = tree;

    // Parse the tree's graph() text into a node structure and keep whatever features
    // getTreeFeatures extracts from it.
    Reader treeDot = new StringReader(tree.graph());
    TreeBuild treeBuild = new TreeBuild();
    Node treeRoot = treeBuild.create(treeDot);
    FeaturesUsed = getTreeFeatures(treeRoot);
}
/// <summary>
/// Trains a NaiveBayes classifier that separates <c>DomPool.TargetNodes</c> from the
/// other nodes in <c>allNodes</c>, using one 0/1 attribute per selector feature, and
/// stores the result in <c>classifier</c>.
/// </summary>
/// <remarks>
/// Target nodes are oversampled: each is added
/// <c>NonTargetNodes.Count() / TargetNodes.Count()</c> times (integer division), but
/// always at least once so that positive examples are never dropped when targets
/// outnumber non-targets. <c>FeaturesUsed</c> is set to the string form of every
/// selector feature; no subset is derived from the trained model.
/// </remarks>
public void LearnModel()
{
    Init();

    // Record, for each known node, which selector features match it.
    foreach (Feature selectorFeature in DomPool.SelectorFeatures)
    {
        String featureString = selectorFeature.ToString();
        HashSet<HtmlNode> resNodes = DomPool.RunXpathQuery(featureString);
        foreach (HtmlNode nd in resNodes)
        {
            // Query results outside the tracked node set are ignored.
            if (allNodes.Contains(nd))
            {
                nodeFeatures[nd].Add(featureString);
            }
        }
    }

    FastVector fvWekaAttributes = GetDataSetAtts();
    int classIndex = fvWekaAttributes.size() - 1; // the class attribute is the last one
    Instances trainingSet = new Instances("TS", fvWekaAttributes, 10);
    trainingSet.setClassIndex(classIndex);

    // Oversampling factor for target nodes. The integer division yields 0 when targets
    // outnumber non-targets, which used to remove every positive example from the
    // training set; clamp it to at least one copy.
    int targetCount = DomPool.TargetNodes.Count();
    int targetCopies = 1;
    if (targetCount > 0)
    {
        targetCopies = DomPool.NonTargetNodes.Count() / targetCount;
        if (targetCopies < 1)
        {
            targetCopies = 1;
        }
    }

    foreach (HtmlNode currNode in allNodes)
    {
        Instance item = new SparseInstance(fvWekaAttributes.size());

        // Every non-class attribute is a 0/1 flag: does this feature match the node?
        for (int i = 0; i < classIndex; i++)
        {
            weka.core.Attribute currFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
            item.setValue(currFeature, nodeFeatures[currNode].Contains(currFeature.name()) ? 1 : 0);
        }

        // Set the class label.
        bool isTarget = DomPool.TargetNodes.Contains(currNode);
        weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);
        item.setValue(classFeature, isTarget ? "yes" : "no");
        item.setDataset(trainingSet);

        if (isTarget)
        {
            for (int t = 0; t < targetCopies; t++)
            {
                trainingSet.add(new SparseInstance(item));
            }
        }
        else
        {
            trainingSet.add(item);
        }
    }

    // Train with the classifier's default options.
    NaiveBayes cls = new NaiveBayes();
    cls.buildClassifier(trainingSet);

    // Save the resulting classifier.
    classifier = cls;

    // Every selector feature is recorded as used.
    FeaturesUsed = new HashSet<string>();
    foreach (Feature f in DomPool.SelectorFeatures)
    {
        FeaturesUsed.Add(f.ToString());
    }
}