public void Extract_Tests()
        {
            // Exercises FeatureExtraction.Extract(source, validValues, ...) against a
            // comma-separated vocabulary of closure types.
            // NOTE(review): the meaning of the two trailing integer arguments is not
            // visible from this test; only the expected outputs below are known.
            var extractor    = new FeatureExtraction();
            var closureTypes = "Button Fly,Button-End,D Ring,Double D Ring,Drawstring,Elastic,Flat Solid Buckle,Hook & Eye,J-Clip,Pull On,Round Classic Ring,Self Tie,Snap On,Snaps,Square Classic Ring,Velcro,Zipper";

            // Scenario 1: the source text is exactly one of the valid values.
            var exactMatch = extractor.Extract("Double D Ring", closureTypes);
            Assert.AreEqual("Double D Ring", exactMatch);

            // Scenario 2: the source is a longer description that contains valid values.
            //   2.1  description with a single value asserted ("D Ring")
            //   2.2  description with two values asserted, both returned
            //   2.3  same description, only one value returned
            //   2.4  (disabled) one value selected with the help of a keyword
            var singleValueText = "Versace Collection D Ring woven leather tote bag with golden hardware. Flat top handles with hanging logo medallion, 5.5\" drop. Removable chain shoulder strap, 18.5\" drop. Flap top with logo plaque; snap closure. Interior, cotton lining; one zip and two slip pockets. Metal feet protect bottom of bag. 10.5\"H x 14\"W x 4.5\"D; weighs 13 oz. Imported.";
            var singleValue     = extractor.Extract(singleValueText, closureTypes);
            Assert.AreEqual("D Ring", singleValue);

            var twoValueText = "Versace Collection D Ring woven leather tote bag with golden hardware. Flat top handles with hanging logo medallion, 5.5\" drop. Removable chain shoulder strap, 18.5\" drop. Flap top with logo plaque; snap closure. Interior, cotton lining; one zip and two slip pockets. Metal feet protect bottom of bag. 10.5\"H x 14\"W x 4.5\"D; weighs 13 oz. Imported. Elastic";

            // With trailing arguments (3, 0) both matches are expected, joined by "/".
            var bothValues = extractor.Extract(twoValueText, closureTypes, 3, 0);
            Assert.AreEqual("D Ring/Elastic", bothValues);

            // With trailing arguments (3, 1) only the first match is expected.
            var firstValueOnly = extractor.Extract(twoValueText, closureTypes, 3, 1);
            Assert.AreEqual("D Ring", firstValueOnly);

            // 2.4 is not enabled yet. The intended call passes (3, 1, "/", "closure")
            // and expects "Snaps".

            // Scenario 3: not written yet.

            // Scenario 4: similarity checks, where the description holds a truncated
            // value ("D Rin", "Elasti"). These calls only verify that Extract completes;
            // no expected value is asserted yet.
            var truncatedDRingText = "Versace Collection woven D Rin leather tote bag with golden hardware. Flat top handles with hanging logo medallion, 5.5\" drop. Removable chain shoulder strap, 18.5\" drop. Flap top with logo plaque; snap closure. Interior, cotton lining; one zip and two slip pockets. Metal feet protect bottom of bag. 10.5\"H x 14\"W x 4.5\"D; weighs 13 oz. Imported.";
            extractor.Extract(truncatedDRingText, closureTypes, 1, 0);

            var truncatedElasticText = "Versace Collection woven Elasti leather tote bag with golden hardware. Flat top handles with hanging logo medallion, 5.5\" drop. Removable chain shoulder strap, 18.5\" drop. Flap top with logo plaque; snap closure. Interior, cotton lining; one zip and two slip pockets. Metal feet protect bottom of bag. 10.5\"H x 14\"W x 4.5\"D; weighs 13 oz. Imported.";
            extractor.Extract(truncatedElasticText, closureTypes, 1, 1);
        }
// Example #2
// 0
        /// <summary>
        /// Rebuilds <c>OrderFeatures</c>, <c>SelectorFeatures</c> and
        /// <c>FeatureFrequencey</c> from the training documents.
        /// </summary>
        /// <remarks>
        /// For every name in <c>trainingDocsNames</c> the feature set is taken from the
        /// <c>docsFeatures</c> cache when present. Otherwise it is extracted from the
        /// document's nodes that also appear in <c>TargetNodes</c> and then cached.
        /// Features of type <c>Feature.FeatureType.Order</c> go to <c>OrderFeatures</c>;
        /// all others go to <c>SelectorFeatures</c> and are counted in
        /// <c>FeatureFrequencey</c> under the lower-cased first element of
        /// <c>feature</c>. Finally, <c>SelectorFeatures</c> is narrowed to the features
        /// whose key was counted at least twice (at least once when fewer than two
        /// training documents exist).
        /// </remarks>
        /// <exception cref="InvalidOperationException">
        /// A training document has no cached features and no parsed document is
        /// registered for it in <c>docsAndNames</c>.
        /// </exception>
        public static void ExtractAllFeatures()
        {
            OrderFeatures     = new HashSet<Feature>();
            SelectorFeatures  = new HashSet<Feature>();
            FeatureFrequencey = new Dictionary<string, int>();

            foreach (String docName in trainingDocsNames)
            {
                HashSet<Feature> extracted;
                if (!docsFeatures.TryGetValue(docName, out extracted))
                {
                    // The parsed document is only needed when nothing is cached, so it is
                    // looked up (and scanned) lazily here.
                    HtmlNode src;
                    if (!docsAndNames.TryGetValue(docName, out src) || src == null)
                    {
                        throw new InvalidOperationException(
                            "No parsed document is available for training document '" + docName + "'.");
                    }

                    extracted = new HashSet<Feature>();

                    // NOTE(review): assuming HtmlNode is HtmlAgilityPack's, SelectNodes can
                    // return null when nothing matches; treat that as "no nodes".
                    IEnumerable<HtmlNode> allNodes = src.SelectNodes("//*");
                    if (allNodes != null)
                    {
                        HashSet<HtmlNode> docNodes = new HashSet<HtmlNode>(allNodes);
                        foreach (HtmlNode curr in docNodes.Intersect(TargetNodes))
                        {
                            extracted.UnionWith(FeatureExtraction.Extract(curr));
                        }
                    }

                    docsFeatures.Add(docName, extracted);
                }

                foreach (Feature currFeature in extracted)
                {
                    if (currFeature.type.Equals(Feature.FeatureType.Order))
                    {
                        OrderFeatures.Add(currFeature);
                        continue;
                    }

                    SelectorFeatures.Add(currFeature);

                    // ToLower() (not ToLowerInvariant) is kept on purpose so the keys stay
                    // compatible with any other code that reads FeatureFrequencey.
                    string key = currFeature.feature.First().ToLower();

                    // TryGetValue leaves 'seen' at 0 when the key is new.
                    int seen;
                    FeatureFrequencey.TryGetValue(key, out seen);
                    FeatureFrequencey[key] = seen + 1;
                }
            }

            // A key must have been counted at least twice to be kept, unless there are
            // fewer than two training documents.
            int thresholdNumber = trainingDocsNames.Count() < 2 ? 1 : 2;

            var sharedFeatures = new HashSet<String>(
                FeatureFrequencey.Where(x => x.Value >= thresholdNumber).Select(x => x.Key));

            SelectorFeatures = new HashSet<Feature>(
                SelectorFeatures.Where(f => sharedFeatures.Contains(f.feature.First().ToLower())));
        }