public HtmlNode PrependChild(HtmlNode newChild)
| Name | Description |
| newChild | The node to add. May not be null. |
| return | The node added. |
//========================================================================================= // 実体のないファイルの場合、画像やリンクは相対アドレスは不可能なので、絶対アドレスへと置き換える。 //========================================================================================= private string RelativeToAbsolute(string html) { try { string fn = strCurFileFullPath; fn = fn.Replace("\\", "/"); string strBaseUrl = fn; HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument(); doc.LoadHtml(html); bool isExistBaseHref = false; foreach (var nodeBase in doc.DocumentNode.Descendants("base")) { if (nodeBase.Attributes["href"].Value.Length > 0) { isExistBaseHref = true; } else { } } // base hrefの指定が無いのであれば、現在の開いているファイルのディレクトリをそこにしておくことで、相対ディレクトリをフォローする。 if (!isExistBaseHref) { string basedir = System.IO.Path.GetDirectoryName(strBaseUrl); HtmlAgilityPack.HtmlNode baseNode = HtmlAgilityPack.HtmlNode.CreateNode("<base href=''>"); baseNode.Attributes["href"].Value = basedir + "\\"; // Headタグがあればそこにたす HtmlAgilityPack.HtmlNode nodeHead = doc.DocumentNode.SelectSingleNode("/html/head"); HtmlAgilityPack.HtmlNode nodeHtml = doc.DocumentNode.SelectSingleNode("/html"); if (nodeHead != null) { nodeHead.PrependChild(baseNode); } else if (nodeHtml != null) { nodeHtml.PrependChild(baseNode); } // Headタグがないなら、トップにたさざるをえないだろう else { doc.DocumentNode.PrependChild(baseNode); } } StringWriter writer = new StringWriter(); doc.Save(writer); string newHtml = writer.ToString(); return(newHtml); } catch (Exception) { } return(html); }
/// <summary>
/// Parses the <c>#content</c> element of a fetched page into <c>patternObject</c>
/// (title, categories, links and relation types) and writes the result as JSON to the
/// file named by <c>Pattern.GetFileName</c>.
/// </summary>
/// <param name="response">The fetched page; its <c>#content</c> element and final URL are read.</param>
/// <remarks>
/// The XPath expressions below start with <c>//</c>, so they search the whole loaded
/// document rather than only the descendants of the node they are invoked on.
/// NOTE(review): a page without <c>#content</c>, <c>#firstHeading</c> or
/// <c>#mw-content-text</c> still fails with an exception, as before.
/// </remarks>
public override void Parse(Response response)
{
    // Create a new HtmlAgilityPack document and load the #content of the page into it.
    HtmlDocument ContentDocument = new HtmlDocument();
    ContentDocument.LoadHtml(response.Css("#content").First().OuterHtml);
    HtmlAgilityPack.HtmlNode BodyNode = ContentDocument.DocumentNode;

    patternObject.Title = BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;
    HtmlAgilityPack.HtmlNode ContentNode = BodyNode.SelectSingleNode("//*[@id=\"mw-content-text\"]");

    // Remove the "toc" section and all HTML comments to save space and later
    // client-side processing time.
    HtmlAgilityPack.HtmlNode tocNode = ContentNode.SelectSingleNode("//*[@id=\"toc\"]");
    if (tocNode != null)
    {
        tocNode.Remove();
    }

    // SelectNodes returns null (not an empty collection) when nothing matches.
    var commentNodes = ContentNode.SelectNodes("//comment()");
    if (commentNodes != null)
    {
        foreach (var commentNode in commentNodes)
        {
            commentNode.Remove();
        }
    }

    ContentNode.PrependChild(BodyNode.SelectSingleNode("//*[@id=\"firstHeading\"]"));

    // Set the patternObject's title (again, now that comments have been stripped).
    patternObject.Title = ContentNode.SelectSingleNode("//*[@id=\"firstHeading\"]").InnerHtml;

    var links = ContentNode.SelectNodes("//a/@href");
    if (links != null)
    {
        foreach (var link in links)
        {
            string href = link.Attributes["href"].Value;

            // Skip if this is a redlink (page doesn't exist).
            if (href.Contains("redlink=1"))
            {
                continue;
            }

            // Skip if this links to this page.
            if (href.Split('#').First() == response.FinalUrl)
            {
                continue;
            }

            // If any of the link's ancestor nodes is the "category links" part of the page...
            if (link.Ancestors().Any(ancestor => ancestor.Id == "catlinks"))
            {
                // ...and it is not the "Categories" special page, record it as a category.
                if (link.InnerText != "Categories")
                {
                    patternObject.Categories.Add(link.InnerText);
                }
            }
            else
            {
                // Assume it is a normal text-body link; register it if not already known.
                patternObject.CreateOrGetPatternLink(link.InnerText);
            }

            // Add relation info if this link sits under the "Relations" h2 heading.
            // NOTE(review): assumes GetNodeReleventPageHeading is side-effect free, so one
            // call per heading level is equivalent to the repeated calls used previously.
            HtmlAgilityPack.HtmlNode sectionHeadingNode = GetNodeReleventPageHeading(link, "h2");
            if (sectionHeadingNode == null || sectionHeadingNode.InnerText != "Relations")
            {
                continue;
            }

            // The h3 heading names the relation type; without one there is nothing to record.
            HtmlAgilityPack.HtmlNode RelationHeadingNode = GetNodeReleventPageHeading(link, "h3");
            if (RelationHeadingNode == null)
            {
                continue;
            }

            String RelationName = RelationHeadingNode.InnerText;

            // If an h4 heading was found and it starts after the h3 heading, assume it is a
            // "with x" sub-category of the relation (as in the "Can Instantiate" section).
            HtmlAgilityPack.HtmlNode subHeadingNode = GetNodeReleventPageHeading(link, "h4");
            if (subHeadingNode != null && RelationHeadingNode.InnerStartIndex < subHeadingNode.InnerStartIndex)
            {
                RelationName = RelationHeadingNode.InnerText + " " + subHeadingNode.InnerText;
            }

            // Add the relevant relation to this link.
            patternObject.CreateOrGetPatternLink(link.InnerText).Type.Add(RelationName);
        }
    }

    // Get a cleaned copy of the #content HTML for the JSON data, then persist it.
    patternObject.Content = ProcessPageContentToString(ContentNode);
    string Json = JsonConvert.SerializeObject(patternObject);
    File.WriteAllText(Pattern.GetFileName(patternObject.Title), Json);
}