Beispiel #1
0
        /// <summary>
        /// Builds selector features for a piece of text: one exact-match
        /// predicate on the trimmed text, plus one <c>contains(text(), ...)</c>
        /// predicate for every fragment of the escaped text that is at least
        /// two characters long.
        /// </summary>
        /// <param name="text">Text to generalize. It is dereferenced without a null check.</param>
        /// <returns>The set of generated features.</returns>
        public static HashSet <Feature> GeneralizeTextFeatures(String text)
        {
            // Exact-match feature on the trimmed text.
            var features = new HashSet <Feature>
            {
                new Feature
                {
                    type    = Feature.FeatureType.Selector,
                    feature = new List <string> { "text()='" + XpathTools.EscapeString(text.Trim()) + "'" },
                    cost    = FeatureCosts.TEXT_EQUAL
                }
            };

            // Zero-width split in front of every upper-case letter, dash or
            // whitespace character. The capture group makes Regex.Split also
            // emit each boundary character as its own one-character entry;
            // those entries are dropped by the length check below.
            Regex boundary = new Regex("(?=([A-Z\\-\\s]))");

            foreach (string piece in boundary.Split(XpathTools.EscapeString(text)))
            {
                string token = piece.Replace("-", "").Trim();
                if (token.Length >= 2)
                {
                    features.Add(new Feature
                    {
                        type    = Feature.FeatureType.Selector,
                        feature = new List <string> { "contains(text(),'" + token + "')" },
                        cost    = FeatureCosts.TEXT_CONTAINS
                    });
                }
            }

            return features;
        }
Beispiel #2
0
        /// <summary>
        /// Builds selector features for one HTML attribute:
        /// an existence test (<c>@name</c>), exact-value tests on this attribute
        /// and on any attribute (when the trimmed value is non-empty), and
        /// case-insensitive "contains" tests for every fragment of the value and
        /// for every pair of adjacent fragments.
        /// </summary>
        /// <param name="att">Attribute to generalize. Its name and value are dereferenced without null checks.</param>
        /// <returns>
        /// The generated features; an empty set when the attribute is one of the
        /// two <c>DomPool</c> selection-marker attributes.
        /// </returns>
        public static HashSet <Feature> GeneralizeAttributeFeatures(HtmlAttribute att)
        {
            if (att.Name.Equals(DomPool.selectionAttribute) || att.Name.Equals(DomPool.optionalSelectionAttribute))
            {
                return(new HashSet <Feature>());
            }
            HashSet <Feature> res = new HashSet <Feature>();

            // Base condition: the attribute exists at all.
            res.Add(CreateSelectorFeature("@" + att.Name, FeatureCosts.ATT_BASE));

            string trimmedValue = att.Value.Trim();
            if (trimmedValue.Length >= 1)
            {
                string escapedValue = XpathTools.EscapeString(trimmedValue);
                res.Add(CreateSelectorFeature("@" + att.Name + "='" + escapedValue + "'", FeatureCosts.ATT_EQUAL));
                res.Add(CreateSelectorFeature("@*='" + escapedValue + "'", FeatureCosts.ATT_ANY_EQUAL));
            }

            // Zero-width split in front of every upper-case letter, dash,
            // whitespace, '/', '?', '=' or '_'. The lookahead is deliberately
            // NON-capturing: Regex.Split inserts captured text into its result,
            // which previously added one-character entries that corrupted the
            // adjacent-pair loop below (e.g. "fooBar" produced "BBar").
            Regex reg = new Regex("(?=[A-Z\\-\\s/\\?\\=_])");

            string[] splitted = reg.Split(XpathTools.EscapeString(att.Value));

            // TODO (from the original author): fragments should keep only the
            // letters and drop the remaining symbols ('/', '?', '=', '_').
            foreach (string split in splitted)
            {
                string curr = NormalizeContainsNeedle(split);
                if (curr.Length < 2)
                {
                    continue;
                }
                AddAttributeContainsFeatures(res, att.Name, curr, 0.0);
            }

            // Adjacent fragments joined together, at a slightly higher cost.
            for (int i = 0; i < splitted.Length - 1; i++)
            {
                // An empty side (the value starts with a delimiter) would just
                // repeat the single-fragment feature at a higher cost.
                if (splitted[i].Length == 0 || splitted[i + 1].Length == 0)
                {
                    continue;
                }
                string curr = NormalizeContainsNeedle(splitted[i] + splitted[i + 1]);
                if (curr.Length < 2)
                {
                    continue;
                }
                AddAttributeContainsFeatures(res, att.Name, curr, 0.05);
            }

            return(res);
        }

        /// <summary>Creates a selector-type feature holding a single expression and the given cost.</summary>
        private static Feature CreateSelectorFeature(string expression, double cost)
        {
            Feature f = new Feature();

            f.type    = Feature.FeatureType.Selector;
            f.feature = new List <string>()
            {
                expression
            };
            f.cost = cost;
            return(f);
        }

        /// <summary>
        /// Adds the two case-insensitive "contains" features for a needle: one bound
        /// to the named attribute and one matching any attribute.
        /// </summary>
        private static void AddAttributeContainsFeatures(HashSet <Feature> res, string attName, string needle, double extraCost)
        {
            res.Add(CreateSelectorFeature(
                        "@" + attName + "[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" + needle + "')]",
                        FeatureCosts.ATT_CONTAINS + extraCost));
            res.Add(CreateSelectorFeature(
                        "@*[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" + needle + "')]",
                        FeatureCosts.ATT_ANY_CONTAINS + extraCost));
        }

        /// <summary>
        /// Prepares a fragment for use as a "contains" needle: strips dashes and
        /// whitespace from both ends (interior characters are kept so the needle
        /// stays a real substring of the value) and lower-cases ASCII letters.
        /// </summary>
        private static string NormalizeContainsNeedle(string fragment)
        {
            int start = 0;
            int end   = fragment.Length;

            while (start < end && (fragment[start] == '-' || char.IsWhiteSpace(fragment[start])))
            {
                start++;
            }
            while (end > start && (fragment[end - 1] == '-' || char.IsWhiteSpace(fragment[end - 1])))
            {
                end--;
            }

            // The generated predicate lower-cases the attribute value with
            // translate() over A-Z only, so the needle must be lower-cased the
            // same way or fragments starting with an upper-case letter can never
            // match. Only A-Z is mapped, mirroring translate() exactly.
            char[] chars = fragment.Substring(start, end - start).ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                if (chars[i] >= 'A' && chars[i] <= 'Z')
                {
                    chars[i] = (char)(chars[i] + ('a' - 'A'));
                }
            }
            return(new string(chars));
        }