/// <summary>
/// Classifies every node in <c>testSeenAllNodes</c> with <c>classifierTree</c>.
/// Each node is encoded as a 0/1 vector over the attributes returned by
/// <c>GetDataSetAtts()</c>: 1 when the attribute's name is among the features
/// recorded for that node, 0 otherwise.
/// </summary>
/// <returns>The nodes whose predicted class label is "yes".</returns>
public HashSet <HtmlNode> RunOnTestSeenSet()
        {
            HashSet<HtmlNode> selected = new HashSet<HtmlNode>();

            InitTestSeen();

            // Record, for each known node, which of the used features matched it.
            foreach (string usedFeature in FeaturesUsed)
            {
                foreach (HtmlNode matched in DomPool.TESTSeenRunXpathQuery(useNormalPerformanceQUERY(usedFeature)))
                {
                    if (testSeenAllNodes.Contains(matched))
                    {
                        testSeenNodeFeatures[matched].Add(usedFeature);
                    }
                }
            }

            FastVector attributes     = GetDataSetAtts();
            int        attributeCount = attributes.size();
            int        classIndex     = attributeCount - 1; // the last attribute is used as the class
            Instances  testSet        = new Instances("TestSeenSet", attributes, 10);

            testSet.setClassIndex(classIndex);

            foreach (HtmlNode node in testSeenAllNodes)
            {
                Instance row = new SparseInstance(attributeCount);

                // Every non-class attribute gets an explicit 1 or 0.
                for (int idx = 0; idx < classIndex; idx++)
                {
                    weka.core.Attribute feature = (weka.core.Attribute)attributes.elementAt(idx);
                    bool present = testSeenNodeFeatures[node].Contains(feature.name());
                    row.setValue(feature, present ? 1 : 0);
                }

                // The class value is deliberately left unset; it is what gets predicted.
                weka.core.Attribute classAttribute = (weka.core.Attribute)attributes.elementAt(classIndex);
                row.setDataset(testSet);

                // classifyInstance returns the index of the predicted label.
                double predictedIndex = classifierTree.classifyInstance(row);
                string predictedLabel = classAttribute.value((int)predictedIndex);

                if (predictedLabel.Equals("yes"))
                {
                    selected.Add(node);
                }

                testSet.add(row);
            }

            return selected;
        }
Beispiel #2
0
        /// <summary>
        /// Builds a single dense instance describing a "who" candidate token.
        /// Attributes 0-5 hold candidate-level values (value, word count, sentence,
        /// position, sentence-start proximity, frequency); the remaining attributes
        /// hold the values and part-of-speech tags of the neighbouring tokens taken
        /// from <c>articleCurrent</c>.
        /// </summary>
        /// <param name="fvWho">Attribute vector whose elements are addressed by index.</param>
        /// <param name="candidate">The token to describe.</param>
        /// <returns>The populated instance; attributes with no available data are not set here.</returns>
        private Instance createSingleWhoInstance(FastVector fvWho, Token candidate)
        {
            // Attribute layout: [0..5] candidate | words before | words after | POS before | POS after | +1 slot
            int wordsBeforeStart = 6;
            int wordsAfterStart  = wordsBeforeStart + whoWordsBefore;
            int posBeforeStart   = wordsAfterStart + whoWordsAfter;
            int posAfterStart    = posBeforeStart + whoWordsBefore;
            // One extra slot beyond the feature attributes (presumably the class attribute).
            int attributeTotal   = posAfterStart + whoWordsAfter + 1;

            Instance whoCandidate = new DenseInstance(attributeTotal);

            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(0), candidate.Value);
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(1), candidate.Value.Split(' ').Count());
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(2), candidate.Sentence);
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(3), candidate.Position);

            // Offset of the candidate from the first token of its sentence, relative to
            // the sentence's token count. Stays at -1 when no matching sentence is found.
            double proximity = -1;
            foreach (List<Token> sentenceTokens in segregatedArticleCurrent)
            {
                if (sentenceTokens.Count > 0 && sentenceTokens[0].Sentence == candidate.Sentence)
                {
                    proximity = (double)(candidate.Position - sentenceTokens[0].Position) / (double)sentenceTokens.Count;
                    break;
                }
            }
            if (proximity > -1)
            {
                whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(4), proximity);
            }
            whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(5), candidate.Frequency);

            // Preceding tokens: slot 0 is the farthest one, the last slot is the nearest.
            for (int slot = 0; slot < whoWordsBefore; slot++)
            {
                int distance   = whoWordsBefore - slot;
                int tokenIndex = candidate.Position - distance - 1;
                if (tokenIndex < 0)
                {
                    continue; // runs off the start of the article
                }

                var neighbour = articleCurrent[tokenIndex];
                whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(wordsBeforeStart + slot), neighbour.Value);
                if (neighbour.PartOfSpeech != null)
                {
                    whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(posBeforeStart + slot), neighbour.PartOfSpeech);
                }
            }

            // Following tokens, starting at index candidate.Position.
            for (int slot = 0; slot < whoWordsAfter; slot++)
            {
                int tokenIndex = candidate.Position + slot;
                if (tokenIndex >= articleCurrent.Count)
                {
                    continue; // runs off the end of the article
                }

                var neighbour = articleCurrent[tokenIndex];
                whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(wordsAfterStart + slot), neighbour.Value);
                if (neighbour.PartOfSpeech != null)
                {
                    whoCandidate.setValue((weka.core.Attribute)fvWho.elementAt(posAfterStart + slot), neighbour.PartOfSpeech);
                }
            }

            return whoCandidate;
        }
Beispiel #3
0
        /// <summary>
        /// Builds a single dense instance describing a "why" candidate token.
        /// Attributes 0-6 hold candidate-level values (value, word count, sentence,
        /// score and the NumWho / NumWhen / NumWhere counts); the remaining attributes
        /// hold the values and part-of-speech tags of the neighbouring tokens taken
        /// from <c>articleCurrent</c>.
        /// </summary>
        /// <param name="fvWhy">Attribute vector whose elements are addressed by index.</param>
        /// <param name="candidate">The token to describe.</param>
        /// <returns>The populated instance; attributes with no available data are not set here.</returns>
        private Instance createSingleWhyInstance(FastVector fvWhy, Token candidate)
        {
            // Attribute layout: [0..6] candidate | words before | words after | POS before | POS after | +1 slot
            int wordsBeforeStart = 7;
            int wordsAfterStart  = wordsBeforeStart + whyWordsBefore;
            int posBeforeStart   = wordsAfterStart + whyWordsAfter;
            int posAfterStart    = posBeforeStart + whyWordsBefore;
            // One extra slot beyond the feature attributes (presumably the class attribute).
            int attributeTotal   = posAfterStart + whyWordsAfter + 1;

            Instance whyCandidate = new DenseInstance(attributeTotal);

            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(0), candidate.Value);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(1), candidate.Value.Split(' ').Count());
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(2), candidate.Sentence);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(3), candidate.Score);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(4), candidate.NumWho);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(5), candidate.NumWhen);
            whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(6), candidate.NumWhere);

            // Preceding tokens: slot 0 is the farthest one, the last slot is the nearest.
            for (int slot = 0; slot < whyWordsBefore; slot++)
            {
                int distance   = whyWordsBefore - slot;
                int tokenIndex = candidate.Position - distance - 1;
                if (tokenIndex < 0)
                {
                    continue; // runs off the start of the article
                }

                var neighbour = articleCurrent[tokenIndex];
                whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(wordsBeforeStart + slot), neighbour.Value);
                if (neighbour.PartOfSpeech != null)
                {
                    whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(posBeforeStart + slot), neighbour.PartOfSpeech);
                }
            }

            // Following tokens, starting at index candidate.Position.
            for (int slot = 0; slot < whyWordsAfter; slot++)
            {
                int tokenIndex = candidate.Position + slot;
                if (tokenIndex >= articleCurrent.Count)
                {
                    continue; // runs off the end of the article
                }

                var neighbour = articleCurrent[tokenIndex];
                whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(wordsAfterStart + slot), neighbour.Value);
                if (neighbour.PartOfSpeech != null)
                {
                    whyCandidate.setValue((weka.core.Attribute)fvWhy.elementAt(posAfterStart + slot), neighbour.PartOfSpeech);
                }
            }

            return whyCandidate;
        }
        /// <summary>
        /// Trains a J48 decision tree that separates <c>DomPool.TargetNodes</c> ("yes")
        /// from the remaining nodes in <c>allNodes</c> ("no"). Each node is encoded as a
        /// 0/1 vector over the attributes returned by <c>GetDataSetAtts()</c>. The trained
        /// tree is stored in <c>classifierTree</c> and the features extracted from its
        /// graph are stored in <c>FeaturesUsed</c>.
        /// </summary>
        /// <remarks>
        /// Target nodes are oversampled by the integer ratio of non-target to target
        /// nodes. The ratio is clamped to at least 1 so that target nodes are never
        /// dropped from the training set when they outnumber the non-target nodes.
        /// </remarks>
        public void LearnModel()
        {
            Init();

            // Record, for each known node, which selector features matched it.
            foreach (Feature selectorFeature in DomPool.SelectorFeatures)
            {
                String featureString = selectorFeature.ToString();
                foreach (HtmlNode nd in DomPool.RunXpathQuery(featureString))
                {
                    if (allNodes.Contains(nd))
                    {
                        nodeFeatures[nd].Add(featureString);
                    }
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            int        classIndex       = fvWekaAttributes.size() - 1; // the last attribute is the class
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 100);

            trainingSet.setClassIndex(classIndex);

            weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);

            // Oversampling factor for positive examples, computed once. Integer division
            // yields 0 when targets outnumber non-targets, which previously removed every
            // positive example from the training set; clamp to 1 to keep them.
            int targetCount    = DomPool.TargetNodes.Count();
            int positiveCopies = 1;
            if (targetCount > 0)
            {
                positiveCopies = DomPool.NonTargetNodes.Count() / targetCount;
                if (positiveCopies < 1)
                {
                    positiveCopies = 1;
                }
            }

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < classIndex; i++)
                {
                    weka.core.Attribute attribute = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    item.setValue(attribute, nodeFeatures[currNode].Contains(attribute.name()) ? 1 : 0);
                }

                // Set the class label.
                bool isTarget = DomPool.TargetNodes.Contains(currNode);
                item.setValue(classFeature, isTarget ? "yes" : "no");
                item.setDataset(trainingSet);

                if (isTarget)
                {
                    for (int t = 0; t < positiveCopies; t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            // "-C 0.1" sets J48's pruning confidence threshold to 0.1.
            String[] options = new String[] { "-C", "0.1" };
            J48 tree = new J48();

            tree.setOptions(options);
            tree.buildClassifier(trainingSet);
            // Save the resulting classifier.
            classifierTree = tree;

            // Parse the tree's graph output to find the features the tree actually uses.
            Reader    treeDot   = new StringReader(tree.graph());
            TreeBuild treeBuild = new TreeBuild();
            Node      treeRoot  = treeBuild.create(treeDot);

            FeaturesUsed = getTreeFeatures(treeRoot);
        }
Beispiel #5
0
        /// <summary>
        /// Trains a NaiveBayes classifier that separates <c>DomPool.TargetNodes</c> ("yes")
        /// from the remaining nodes in <c>allNodes</c> ("no"). Each node is encoded as a
        /// 0/1 vector over the attributes returned by <c>GetDataSetAtts()</c>. The trained
        /// model is stored in <c>classifier</c>, and <c>FeaturesUsed</c> is set to the
        /// string form of every selector feature.
        /// </summary>
        /// <remarks>
        /// Target nodes are oversampled by the integer ratio of non-target to target
        /// nodes. The ratio is clamped to at least 1 so that target nodes are never
        /// dropped from the training set when they outnumber the non-target nodes.
        /// </remarks>
        public void LearnModel()
        {
            Init();

            // Record, for each known node, which selector features matched it.
            foreach (Feature selectorFeature in DomPool.SelectorFeatures)
            {
                String featureString = selectorFeature.ToString();
                foreach (HtmlNode nd in DomPool.RunXpathQuery(featureString))
                {
                    if (allNodes.Contains(nd))
                    {
                        nodeFeatures[nd].Add(featureString);
                    }
                }
            }

            FastVector fvWekaAttributes = GetDataSetAtts();
            int        classIndex       = fvWekaAttributes.size() - 1; // the last attribute is the class
            Instances  trainingSet      = new Instances("TS", fvWekaAttributes, 10);

            trainingSet.setClassIndex(classIndex);

            weka.core.Attribute classFeature = (weka.core.Attribute)fvWekaAttributes.elementAt(classIndex);

            // Oversampling factor for positive examples, computed once. Integer division
            // yields 0 when targets outnumber non-targets, which previously removed every
            // positive example from the training set; clamp to 1 to keep them.
            int targetCount    = DomPool.TargetNodes.Count();
            int positiveCopies = 1;
            if (targetCount > 0)
            {
                positiveCopies = DomPool.NonTargetNodes.Count() / targetCount;
                if (positiveCopies < 1)
                {
                    positiveCopies = 1;
                }
            }

            foreach (HtmlNode currNode in allNodes)
            {
                Instance item = new SparseInstance(fvWekaAttributes.size());

                for (int i = 0; i < classIndex; i++)
                {
                    weka.core.Attribute attribute = (weka.core.Attribute)fvWekaAttributes.elementAt(i);
                    item.setValue(attribute, nodeFeatures[currNode].Contains(attribute.name()) ? 1 : 0);
                }

                // Set the class label.
                bool isTarget = DomPool.TargetNodes.Contains(currNode);
                item.setValue(classFeature, isTarget ? "yes" : "no");
                item.setDataset(trainingSet);

                if (isTarget)
                {
                    for (int t = 0; t < positiveCopies; t++)
                    {
                        trainingSet.add(new SparseInstance(item));
                    }
                }
                else
                {
                    trainingSet.add(item);
                }
            }

            // NaiveBayes is trained with its default options.
            NaiveBayes cls = new NaiveBayes();

            cls.buildClassifier(trainingSet);
            // Save the resulting classifier.
            classifier = cls;

            // This model does not expose which features it relies on, so every
            // selector feature is reported as used.
            FeaturesUsed = new HashSet<string>();

            foreach (Feature f in DomPool.SelectorFeatures)
            {
                FeaturesUsed.Add(f.ToString());
            }
        }