Esempio n. 1
0
        /// <summary>
        /// Stores the current node map in the database under the page map identified by
        /// <paramref name="name"/> and the base URL. If such a page map already exists, its
        /// previously stored nodes are queued for deletion and replaced by the current ones;
        /// otherwise a new page map is created.
        /// </summary>
        /// <param name="name">
        /// Name of the page map. When null or empty, a name is derived from the base URL
        /// (the text between the leading "scheme://" and the next '/').
        /// </param>
        public void SaveToDatabase(string name)
        {
            if (string.IsNullOrEmpty(name))
            {
                // Skip the two slashes that follow the scheme.
                // NOTE(review): this assumes the first '/' in the base URL is the start of "//" — confirm.
                int secondSlash = _baseUrl.IndexOf('/') + 1;
                string domain = _baseUrl.Substring(secondSlash + 1);

                // A base URL with nothing after the host (e.g. "http://example.com") has no
                // further '/'; use the whole remainder instead of calling Substring(0, -1),
                // which would throw ArgumentOutOfRangeException.
                int pathStart = domain.IndexOf('/');
                name = pathStart >= 0 ? domain.Substring(0, pathStart) : domain;
            }

            // One lookup instead of Any() followed by Single() with the same predicate.
            PageMap pm = _db.PageMaps.SingleOrDefault(m => m.Name == name && m.BaseURL == _baseUrl);
            if (pm == null)
            {
                pm = new PageMap();
                pm.PageMapID = Guid.NewGuid();
                pm.Name = name;
                pm.BaseURL = _baseUrl;
                _db.PageMaps.InsertOnSubmit(pm);
            }
            else
            {
                // Existing map: drop its old nodes so the ones inserted below replace them.
                _db.Nodes.DeleteAllOnSubmit(_db.Nodes.Where(n => n.PageMapID == pm.PageMapID));
            }

            // Stays at -1 when the node map is empty; otherwise ends as the last index used.
            int index = -1;
            foreach (var node in _nodeMap)
            {
                index++;

                Node newNode = new Node();
                newNode.PageMapID = pm.PageMapID;
                newNode.NodeIndex = index;
                newNode.NodeName = node.Name;
                newNode.NodeID = node.Id;

                // Record the class attribute's value when the node has one.
                // SingleOrDefault still throws if the node carries more than one "class" attribute.
                var classAttribute = node.Attributes.SingleOrDefault(a => a.Name == "class");
                if (classAttribute != null)
                {
                    newNode.NodeClass = classAttribute.Value;
                }

                _db.Nodes.InsertOnSubmit(newNode);
            }

            _maxIndex = pm.MaxIndex = index;
            _db.SubmitChanges();
        }
Esempio n. 2
0
        /// <summary>
        /// Follows the path described by <paramref name="map"/> down through
        /// <paramref name="document"/>, starting at the html element, and collects the trimmed
        /// inner text of the nodes found at the end of that path.
        /// </summary>
        /// <param name="document">The parsed HTML document to search.</param>
        /// <param name="map">
        /// The page map; its nodes are visited in ascending NodeIndex order. Nodes with an index
        /// below <c>map.MaxIndex</c> are used to descend, the remaining ones to extract text.
        /// </param>
        /// <returns>
        /// The collected text items. The list is empty when nothing matched or when the
        /// document contains no html element.
        /// </returns>
        public static List<string> GetRelevantTextFromDocumentUsingMap(HtmlDocument document, PageMap map)
        {
            List<string> textItems = new List<string>();

            HtmlNode documentNode = document.DocumentNode.SelectSingleNode("//html");
            if (documentNode == null)
            {
                // No html element to start from: nothing can match, and dereferencing
                // documentNode below would throw a NullReferenceException.
                return textItems;
            }

            foreach (Node mapNode in map.Nodes.OrderBy(n => n.NodeIndex))
            {
                // Before the end of the map: work out which child to descend into.
                if (mapNode.NodeIndex < map.MaxIndex)
                {
                    // Children of the current node with the right name and ID.
                    List<HtmlNode> candidates = documentNode.ChildNodes
                        .Where(n => n.Name == mapNode.NodeName && n.Id == mapNode.NodeID)
                        .ToList();

                    // Exactly one candidate: descend into it.
                    if (candidates.Count == 1)
                    {
                        documentNode = candidates[0];
                        continue;
                    }

                    // Otherwise pick the first candidate that has a class attribute equal to the mapped class.
                    HtmlNode match = candidates.FirstOrDefault(
                        c => c.Attributes.AttributesWithName("class").Any(a => a.Value == mapNode.NodeClass));

                    if (match == null)
                    {
                        // No class match: look for the ID of the next map node that has one
                        // inside each candidate's markup. The lookup does not depend on the
                        // candidate, so it is done once rather than per candidate.
                        Node nextNode = map.Nodes
                            .Where(n => n.NodeIndex > mapNode.NodeIndex && !string.IsNullOrEmpty(n.NodeID))
                            .OrderBy(n => n.NodeIndex)
                            .FirstOrDefault();

                        if (nextNode != null)
                        {
                            match = candidates.FirstOrDefault(c => c.InnerHtml.Contains(nextNode.NodeID));
                        }
                    }

                    if (match == null)
                    {
                        // Still nothing: usually the walk has hit a list item. Take the text
                        // of every candidate and stop following the map.
                        foreach (HtmlNode node in candidates)
                        {
                            textItems.Add(node.InnerText.Trim());
                        }

                        break;
                    }

                    documentNode = match;
                }
                // At the end of the map: start collecting text.
                else
                {
                    foreach (HtmlNode node in documentNode.ChildNodes.Where(n => n.Name == mapNode.NodeName && n.Id == mapNode.NodeID))
                    {
                        // Without class information there is nothing to filter on; take the text as is.
                        if (mapNode.NodeClass == null)
                        {
                            textItems.Add(node.InnerText.Trim());
                        }
                        else
                        {
                            // With class information, only take nodes whose class attribute matches
                            // the class of the map node at MaxIndex. The lookup stays inside the loop
                            // so that it is only evaluated (and can only throw) when a class
                            // attribute is actually present.
                            foreach (HtmlAttribute classAttribute in node.Attributes.AttributesWithName("class"))
                            {
                                string className = map.Nodes.Single(n => n.NodeIndex == map.MaxIndex).NodeClass;
                                if (classAttribute.Value == className)
                                {
                                    textItems.Add(node.InnerText.Trim());
                                }
                            }
                        }
                    }
                }
            }

            return textItems;
        }
 // Partial method declarations: signatures only, with no bodies in this part of the type.
 // Each receives the PageMap instance concerned.
 // NOTE(review): these look like the update/delete/insert extensibility hooks that a
 // LINQ to SQL designer file declares for an entity — confirm against the enclosing
 // class, which is not visible here.
 partial void UpdatePageMap(PageMap instance);
 partial void DeletePageMap(PageMap instance);
 partial void InsertPageMap(PageMap instance);