// Walks every div with class 'EXAMPLES' in the page and writes one tab-separated
// card line per example to the output stream, incrementing count_ for each line.
//
// stream     - destination for the generated card lines.
// document   - parsed HTML page to scan.
// link       - page address; only used in the failure messages sent to reportStream.
// userLabels - extra labels appended after the "macmillan <word>" labels;
//              null or empty means "no extra labels".
//
// If the page has no examples block at all, a failure is reported and nothing is written.
// A missing definition or example sentence is reported, but the card line is still written.
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var examples = document.DocumentNode.SelectNodes("//div[@class='EXAMPLES']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    var gbrTranscription = document.DocumentNode.SelectSingleNode("//span[@class='PRON']");
    var word = getText(document.DocumentNode.SelectSingleNode("//span[@class='BASE']"));

    // Labels are space-separated, so a multi-word headword is joined with dashes.
    var wordLabel = word.Replace(' ', '-');
    // IsNullOrEmpty (rather than == "") keeps a null argument from producing a trailing space.
    userLabels = string.IsNullOrEmpty(userLabels)
        ? "macmillan " + wordLabel
        : "macmillan " + wordLabel + " " + userLabels;

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parentNode = example.ParentNode;

        // Getting a structure: prefer the <strong> inside the example block itself.
        var structure = example.SelectSingleNode("strong");
        if (structure == null)
        {
            // Fall back to the headword span two levels above the example's parent
            // (DIV#SENSE_BODY -> DIV -> LI). Each ancestor step is checked so an
            // unexpectedly shallow page does not throw NullReferenceException.
            var grandParent = parentNode != null ? parentNode.ParentNode : null;
            var senseRoot = grandParent != null ? grandParent.ParentNode : null;
            if (senseRoot != null)
            {
                structure = senseRoot.SelectSingleNode("div/h2/span[@class='BASE']");
            }
        }

        // NOTE(review): getText is handed possibly-missing nodes here and the results are
        // compared against "" below, so it presumably returns "" for null - confirm.
        var definitionNode = parentNode != null
            ? parentNode.SelectSingleNode("span[@class='DEFINITION']")
            : null;

        card.Sentence = getText(example.SelectSingleNode("p[@id='EXAMPLE']"));
        card.Definition = getText(definitionNode);
        card.GbrTranscription = getText(gbrTranscription).Replace("/", "");
        card.SimpleStructure = getText(structure).Replace(":", "");

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }
        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        // Column order: sentence, interpretation, word, GBR transcription, USA transcription,
        // structure, definition, labels. Interpretation and UsaTranscription are never
        // assigned in this method, so they carry whatever a fresh Card holds.
        var outStr = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + card.SimpleStructure + "\t"
            + card.Definition + "\t"
            + userLabels + "\n";
        stream.Write(outStr);
        count_++;
    }
}
// Walks every span with class 'x-g' (an example group) in the page and writes one
// tab-separated card line per example to the output stream, incrementing count_ for each line.
//
// stream     - destination for the generated card lines.
// document   - parsed HTML page to scan.
// link       - page address; only used in the failure messages sent to reportStream.
// userLabels - extra labels appended after the "oald8 <word>" labels;
//              null or empty means "no extra labels".
//
// If the page has no example groups at all, a failure is reported and nothing is written.
// A missing definition or example sentence is reported, but the card line is still written.
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var examples = document.DocumentNode.SelectNodes("//span[@class='x-g']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    // Transcription
    var usaTranscription = document.DocumentNode.SelectSingleNode("//span[@class='y']");
    var gbrTranscription = document.DocumentNode.SelectSingleNode("//span[@class='i']");

    // Word "name"
    var word = getText(document.DocumentNode.SelectSingleNode("//h2[@class='h']"));

    // Label: labels are space-separated, so a multi-word headword is joined with dashes.
    var wordLabel = word.Replace(' ', '-');
    // IsNullOrEmpty (rather than == "") keeps a null argument from producing a trailing space.
    userLabels = string.IsNullOrEmpty(userLabels)
        ? "oald8 " + wordLabel
        : "oald8 " + wordLabel + " " + userLabels;

    // Matches a parenthesised gloss of the form " (=...)" inside an example.
    // Built once per page instead of once per example.
    var glossPattern = new Regex(" \\(=.*?\\)");

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parentNode = example.ParentNode;

        // NOTE(review): assumes GetName is a pure lookup, so one call per example is
        // equivalent to the repeated calls it replaces - confirm.
        var parentName = GetName(parentNode);

        // Getting a structure. Parts are added outermost first; the example's own
        // 'cf' span always comes last. Containers not listed here contribute nothing.
        var ownStructure = example.SelectSingleNode("span[@class='cf']");
        if (parentName == "pv-g")
        {
            AppendStructurePart(card, parentNode.SelectSingleNode("h4[@class='pv']"));
            AppendStructurePart(card, ownStructure);
        }
        else if (parentName == "n-g" && GetName(parentNode.ParentNode) == "pv-g")
        {
            // An 'n-g' group nested in a 'pv-g' group: the 'pv' heading lives on the grandparent.
            AppendStructurePart(card, parentNode.ParentNode.SelectSingleNode("h4[@class='pv']"));
            AppendStructurePart(card, parentNode.SelectSingleNode("span[@class='vs-g']"));
            AppendStructurePart(card, ownStructure);
        }
        else if (parentName == "n-g" || parentName == "h-g")
        {
            AppendStructurePart(card, parentNode.SelectSingleNode("span[@class='cf']"));
            AppendStructurePart(card, ownStructure);
        }
        else if (parentName == "id-g")
        {
            AppendStructurePart(card, parentNode.SelectSingleNode("h4[@class='id']"));
            AppendStructurePart(card, ownStructure);
        }

        // Getting a definition. For 'id-g' and 'h-g' containers look inside their
        // 'def_block' div when there is one; otherwise search the parent directly.
        HtmlNode definitionScope = null;
        if (parentName == "id-g" || parentName == "h-g")
        {
            definitionScope = parentNode.SelectSingleNode("div[@class='def_block']");
        }
        if (definitionScope == null)
        {
            definitionScope = parentNode;
        }
        // A 'ud' span wins over a 'd' span when both are present.
        var definition = definitionScope.SelectSingleNode("span[@class='ud']")
            ?? definitionScope.SelectSingleNode("span[@class='d']");

        // An example itself: Interpretation keeps the full text, Sentence has the gloss stripped.
        card.Interpretation = getText(example.SelectSingleNode("span[@class='x']"));
        card.Sentence = glossPattern.Replace(card.Interpretation, "");
        // Normalise non-breaking spaces to plain spaces. Written as an explicit escape because
        // the literal character is indistinguishable from a normal space in most editors.
        // NOTE(review): assumes the original literal was U+00A0 (otherwise the call was a no-op) - confirm.
        card.Definition = getText(definition).Replace("\u00A0", " ");
        card.UsaTranscription = getText(usaTranscription);
        card.GbrTranscription = getText(gbrTranscription);

        // No gloss was present, so the interpretation column would only duplicate the sentence.
        if (card.Interpretation == card.Sentence)
        {
            card.Interpretation = "";
        }

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }
        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        // Column order: sentence, interpretation, word, GBR transcription, USA transcription,
        // structure, definition, labels.
        var outStr = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + PrintList(card.Structure) + "\t"
            + card.Definition + "\t"
            + userLabels + "\n";
        stream.Write(outStr);
        count_++;
    }
}

// Adds the node's inner text to the card's structure list; a missing node is skipped.
private static void AppendStructurePart(Card card, HtmlNode node)
{
    if (node != null)
    {
        card.Structure.Add(node.InnerText);
    }
}