/// <summary>
/// Stores the current node map in the database under the page map identified by
/// <paramref name="name"/> and the current base URL. A new page map row is queued for
/// insertion when none matches; otherwise the nodes already stored for the matching
/// page map are queued for deletion before the current ones are queued for insertion.
/// All queued changes are submitted in a single <c>SubmitChanges</c> call.
/// </summary>
/// <param name="name">
/// Name of the page map. When null or empty, the name is derived from the base URL:
/// the text after the scheme's "//" (or from the start when there is no "//") up to
/// the next "/" or the end of the URL.
/// </param>
public void SaveToDatabase(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        // Locate the host portion of the base URL. A URL with no path after the host
        // (e.g. "http://example.com") has no terminating "/", so fall back to the end
        // of the string instead of passing a negative length to Substring.
        int schemeSeparator = _baseUrl.IndexOf("//", StringComparison.Ordinal);
        int domainStart = schemeSeparator < 0 ? 0 : schemeSeparator + 2;
        int domainEnd = _baseUrl.IndexOf('/', domainStart);

        name = domainEnd < 0
            ? _baseUrl.Substring(domainStart)
            : _baseUrl.Substring(domainStart, domainEnd - domainStart);
    }

    // One lookup instead of Any() followed by Single(); still throws if more than one
    // page map matches the name/base URL pair.
    PageMap pm = _db.PageMaps.SingleOrDefault(m => m.Name == name && m.BaseURL == _baseUrl);

    if (pm == null)
    {
        pm = new PageMap();
        pm.PageMapID = Guid.NewGuid();
        pm.Name = name;
        pm.BaseURL = _baseUrl;
        _db.PageMaps.InsertOnSubmit(pm);
    }
    else
    {
        // Existing map: discard its previously stored nodes so they can be replaced below.
        _db.Nodes.DeleteAllOnSubmit(_db.Nodes.Where(n => n.PageMapID == pm.PageMapID));
    }

    // index stays at -1 when _nodeMap is empty, and that value is what gets stored as MaxIndex.
    int index = -1;
    foreach (var node in _nodeMap)
    {
        index++;

        Node newNode = new Node();
        newNode.PageMapID = pm.PageMapID;
        newNode.NodeIndex = index;
        newNode.NodeName = node.Name;
        newNode.NodeID = node.Id;

        // Use the first "class" attribute; a node carrying duplicate class attributes
        // no longer aborts the save with an InvalidOperationException.
        var classValue = node.Attributes
            .Where(a => a.Name == "class")
            .Select(a => a.Value)
            .FirstOrDefault();
        if (classValue != null)
        {
            newNode.NodeClass = classValue;
        }

        _db.Nodes.InsertOnSubmit(newNode);
    }

    _maxIndex = pm.MaxIndex = index;
    _db.SubmitChanges();
}
/// <summary>
/// Walks <paramref name="document"/> from its html element down the path described by the
/// nodes of <paramref name="map"/> (ordered by NodeIndex) and collects the trimmed inner
/// text of the nodes found at the end of that path.
/// </summary>
/// <param name="document">The parsed HTML document to extract text from.</param>
/// <param name="map">The page map whose nodes describe the path to follow.</param>
/// <returns>
/// The collected text items. Empty when the document has no html element or when
/// nothing along the path matched.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="document"/> or <paramref name="map"/> is null.
/// </exception>
public static List<string> GetRelevantTextFromDocumentUsingMap(HtmlDocument document, PageMap map)
{
    if (document == null)
    {
        throw new ArgumentNullException("document");
    }
    if (map == null)
    {
        throw new ArgumentNullException("map");
    }

    List<string> textItems = new List<string>();

    // SelectSingleNode yields null when the document has no html element (e.g. a bare
    // fragment); there is nothing to walk in that case.
    HtmlNode documentNode = document.DocumentNode.SelectSingleNode("//html");
    if (documentNode == null)
    {
        return textItems;
    }

    foreach (Node mapNode in map.Nodes.OrderBy(n => n.NodeIndex))
    {
        if (mapNode.NodeIndex < map.MaxIndex)
        {
            // Not yet at the end of the map: work out which child to descend into.
            // Start with the children that have the right name and ID.
            List<HtmlNode> candidates = documentNode.ChildNodes
                .Where(n => n.Name == mapNode.NodeName && n.Id == mapNode.NodeID)
                .ToList();

            // If there's only one, use that.
            if (candidates.Count == 1)
            {
                documentNode = candidates[0];
                continue;
            }

            // Otherwise prefer the first candidate with a class attribute equal to the
            // class recorded in the map.
            HtmlNode match = candidates.FirstOrDefault(
                c => c.Attributes.AttributesWithName("class").Any(a => a.Value == mapNode.NodeClass));

            if (match == null)
            {
                // No class match: try the first candidate whose markup mentions the ID of
                // the next map node that has one. The lookup does not depend on the
                // candidate, so it is done once rather than per candidate.
                Node nextNode = map.Nodes
                    .Where(n => n.NodeIndex > mapNode.NodeIndex && !string.IsNullOrEmpty(n.NodeID))
                    .OrderBy(n => n.NodeIndex)
                    .FirstOrDefault();

                if (nextNode != null)
                {
                    match = candidates.FirstOrDefault(c => c.InnerHtml.Contains(nextNode.NodeID));
                }
            }

            if (match == null)
            {
                // Still nothing: usually the walk has hit a list item. Take the text of
                // every candidate at this level and don't bother going deeper.
                foreach (HtmlNode candidate in candidates)
                {
                    textItems.Add(candidate.InnerText.Trim());
                }
                break;
            }

            documentNode = match;
        }
        else
        {
            // At the end of the map: pull text from the matching children.
            foreach (HtmlNode node in documentNode.ChildNodes.Where(n => n.Name == mapNode.NodeName && n.Id == mapNode.NodeID))
            {
                // Without class information there is nothing to filter on. Grab any text.
                if (mapNode.NodeClass == null)
                {
                    textItems.Add(node.InnerText.Trim());
                    continue;
                }

                // With class information, only pull nodes whose class attribute equals the
                // class of the map node at MaxIndex. The lookup is kept inside the loop so
                // it only runs (and can only throw) when a class attribute is present.
                foreach (HtmlAttribute a in node.Attributes.AttributesWithName("class"))
                {
                    string className = map.Nodes.Single(n => n.NodeIndex == map.MaxIndex).NodeClass;
                    if (a.Value == className)
                    {
                        textItems.Add(node.InnerText.Trim());
                    }
                }
            }
        }
    }

    return textItems;
}
// Partial method declarations, each taking a PageMap instance; no implementing bodies
// are visible in this section. Under C# partial-method rules, a declaration that never
// receives an implementation is removed at compile time together with any calls to it.
// NOTE(review): the Update/Delete/Insert naming looks like the extensibility hooks a
// LINQ to SQL designer generates on a data context - confirm against the enclosing class.
partial void UpdatePageMap(PageMap instance);
partial void DeletePageMap(PageMap instance);
partial void InsertPageMap(PageMap instance);