/// <summary>
/// Exercises <c>FeatureExtraction.Extract</c> against a comma-separated list of
/// valid values, covering an exact match, valid values embedded in a longer
/// product description, and misspelled (near-match) values.
/// </summary>
public void Extract_Tests()
{
    var featureExtraction = new FeatureExtraction();
    var validValues = "Button Fly,Button-End,D Ring,Double D Ring,Drawstring,Elastic,Flat Solid Buckle,Hook & Eye,J-Clip,Pull On,Round Classic Ring,Self Tie,Snap On,Snaps,Square Classic Ring,Velcro,Zipper";

    // Every long-text scenario uses the same product description; only the
    // words right after "Versace Collection" differ, so the common tail is
    // declared once instead of being copy-pasted into each source string.
    const string descriptionTail = " leather tote bag with golden hardware. Flat top handles with hanging logo medallion, 5.5\" drop. Removable chain shoulder strap, 18.5\" drop. Flap top with logo plaque; snap closure. Interior, cotton lining; one zip and two slip pockets. Metal feet protect bottom of bag. 10.5\"H x 14\"W x 4.5\"D; weighs 13 oz. Imported.";

    // Scenario 1: the source is exactly one of the valid values.
    var source1 = "Double D Ring";
    var scenario1 = featureExtraction.Extract(source1, validValues);
    Assert.AreEqual("Double D Ring", scenario1);

    // Scenario 2: the source is free text containing valid values.
    //   2.1   source contains a single matching value
    //   2.2.1 source contains multiple matching values
    //   2.2.2 source contains multiple matching values but only one is returned
    //   2.2.3 source contains multiple matching values but only one is returned,
    //         with the assistance of a keyword (not enabled yet, see below)
    var source21 = "Versace Collection D Ring woven" + descriptionTail;
    var scenario21 = featureExtraction.Extract(source21, validValues);
    Assert.AreEqual("D Ring", scenario21);

    // Same description followed by a second valid value on its own line.
    // The line break is written as an escape sequence because a regular string
    // literal cannot span physical lines.
    var source221 = source21 + " \nElastic";

    // NOTE(review): judging by the expected results, the 4th argument looks
    // like a cap on the number of returned matches (0 = no cap) - confirm
    // against FeatureExtraction.Extract. The meaning of the 3rd argument is
    // not visible from this test.
    var scenario221 = featureExtraction.Extract(source221, validValues, 3, 0);
    Assert.AreEqual("D Ring/Elastic", scenario221);

    var scenario222 = featureExtraction.Extract(source221, validValues, 3, 1);
    Assert.AreEqual("D Ring", scenario222);

    // TODO: enable scenario 2.2.3 once keyword-assisted extraction is expected to pass.
    //var scenario223 = featureExtraction.Extract(source221, validValues,
    //    3, 1, "/", "closure");
    //Assert.AreEqual("Snaps", scenario223);

    // Scenario 3: (not defined yet)

    // Scenario 4: similarity checks - the description contains a misspelled
    // valid value ("D Rin", "Elasti").
    // TODO: these two calls only verify that Extract does not throw; add
    // assertions once the expected near-match results are agreed on.
    var source4 = "Versace Collection woven D Rin" + descriptionTail;
    var scenario4 = featureExtraction.Extract(source4, validValues, 1, 0);

    var source41 = "Versace Collection woven Elasti" + descriptionTail;
    var scenario41 = featureExtraction.Extract(source41, validValues, 1, 1);
}
/// <summary>
/// Rebuilds <c>OrderFeatures</c>, <c>SelectorFeatures</c> and
/// <c>FeatureFrequencey</c> from every document named in
/// <c>trainingDocsNames</c>.
/// </summary>
/// <remarks>
/// Per-document features are taken from <c>docsFeatures</c> when already
/// present; otherwise they are extracted from the document's nodes that are
/// also in <c>TargetNodes</c> and stored in <c>docsFeatures</c>.
/// Features of type <c>Feature.FeatureType.Order</c> go to
/// <c>OrderFeatures</c>; all others go to <c>SelectorFeatures</c> and are
/// counted in <c>FeatureFrequencey</c> under the lower-cased first element of
/// <c>Feature.feature</c>. Finally <c>SelectorFeatures</c> is reduced to the
/// features whose count reaches the threshold (2, or 1 when there are fewer
/// than two training documents).
/// </remarks>
/// <exception cref="InvalidOperationException">
/// A training document has no cached features and no parsed document is
/// registered for it in <c>docsAndNames</c>.
/// </exception>
public static void ExtractAllFeatures()
{
    OrderFeatures = new HashSet<Feature>();
    SelectorFeatures = new HashSet<Feature>();
    FeatureFrequencey = new Dictionary<string, int>();

    foreach (String docName in trainingDocsNames)
    {
        HashSet<Feature> extracted;

        // Single lookup: reuse cached features when available, and only touch
        // the parsed document (and run the "//*" query) on a cache miss.
        if (!docsFeatures.TryGetValue(docName, out extracted))
        {
            HtmlNode src;
            if (!docsAndNames.TryGetValue(docName, out src) || src == null)
            {
                // Fail with the offending name instead of a bare
                // NullReferenceException further down.
                throw new InvalidOperationException(
                    "No parsed document is registered for training document '" + docName + "'.");
            }

            extracted = new HashSet<Feature>();

            // NOTE(review): assuming HtmlAgilityPack, where SelectNodes returns
            // null (not an empty collection) when nothing matches; treat that
            // as "no nodes" rather than letting it throw.
            IEnumerable<HtmlNode> docNodes = src.SelectNodes("//*");
            if (docNodes != null)
            {
                // Intersect already yields each common node once, so no
                // intermediate sets are needed.
                foreach (HtmlNode curr in docNodes.Intersect(TargetNodes))
                {
                    extracted.UnionWith(FeatureExtraction.Extract(curr));
                }
            }

            docsFeatures.Add(docName, extracted);
        }

        foreach (Feature currFeature in extracted)
        {
            if (currFeature.type.Equals(Feature.FeatureType.Order))
            {
                OrderFeatures.Add(currFeature);
                continue;
            }

            SelectorFeatures.Add(currFeature);

            // TryGetValue leaves the count at 0 when the key is new, so both
            // the "first seen" and "seen again" cases become count + 1.
            string key = currFeature.feature.First().ToLower();
            int count;
            FeatureFrequencey.TryGetValue(key, out count);
            FeatureFrequencey[key] = count + 1;
        }
    }

    // With a single training document nothing can reach a count of two,
    // so the threshold drops to one.
    int thresholdNumber = trainingDocsNames.Count() < 2 ? 1 : 2;

    var sharedFeatures = new HashSet<String>(
        FeatureFrequencey.Where(x => x.Value >= thresholdNumber).Select(x => x.Key));

    // Keep only the selector features whose key met the threshold.
    SelectorFeatures = new HashSet<Feature>(
        SelectorFeatures.Where(f => sharedFeatures.Contains(f.feature.First().ToLower())));
}