/// <summary>
/// Downloads the example pages for every word in <paramref name="wordlist"/>, writes the parsed
/// cards to <paramref name="stream"/> and records exactly one outcome line per word in the report.
/// </summary>
/// <param name="stream">Destination for the parsed cards.</param>
/// <param name="wordlist">Words to look up.</param>
/// <param name="labels">Extra labels put on every card; may be null or empty.</param>
/// <param name="limit">Exclusive upper bound for the paging offset, which advances in steps of 5.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
{
    // Every card is labelled "lingvo [<labels> ]<word>".
    var labelPrefix = string.IsNullOrEmpty(labels) ? "lingvo " : "lingvo " + labels + " ";

    foreach (var word in wordlist)
    {
        Console.WriteLine("{0}", word);

        var cardLabels = labelPrefix + word;
        var parsedAny = false;

        for (var offset = 0; offset < limit; offset += 5)
        {
            var page = new HtmlDocument();
            page.LoadHtml(DownloadExamples(word, offset));

            // No leading row on this page: either the word has no examples at all
            // or paging simply ran past the last page.
            var leadingRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
            if (leadingRows == null)
            {
                break;
            }

            stream.Write(Parse(leadingRows, cardLabels));
            count_ += leadingRows.Count;
            parsedAny = true;

            var remainingRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
            if (remainingRows == null)
            {
                break;
            }

            stream.Write(Parse(remainingRows, cardLabels));
            count_ += remainingRows.Count;
        }

        // Report the outcome only once it is known, so a word never gets a "Success" line
        // before anything was downloaded, nor a "Failure" line just because paging ended.
        if (parsedAny)
        {
            reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
        }
        else
        {
            reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Fetches example pages for each word of <paramref name="wordlist"/>, sends the parsed cards
/// to <paramref name="stream"/> and logs a single success/failure line per word.
/// </summary>
/// <param name="stream">Receives the parsed cards.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="labels">Additional card labels; null or empty means none.</param>
/// <param name="limit">Paging stops once the offset (incremented by 5 per page) reaches this value.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
{
    // Label prefix shared by all cards; the word itself is appended per word below.
    labels = string.IsNullOrEmpty(labels) ? "lingvo " : "lingvo " + labels + " ";

    foreach (var word in wordlist)
    {
        Console.WriteLine("{0}", word);

        var offset = 0;
        var pagesParsed = 0;
        var wordLabels = labels + word;

        while (offset < limit)
        {
            var page = new HtmlDocument();
            page.LoadHtml(DownloadExamples(word, offset));

            var examples = page.DocumentNode.SelectNodes("//tr[@class='item first']");
            if (examples == null)
            {
                // Nothing on this page; whether that is a failure depends on
                // whether an earlier page already produced cards (decided below).
                break;
            }

            stream.Write(Parse(examples, wordLabels));
            count_ += examples.Count;
            pagesParsed++;

            examples = page.DocumentNode.SelectNodes("//tr[@class='item']");
            if (examples == null)
            {
                break;
            }

            stream.Write(Parse(examples, wordLabels));
            count_ += examples.Count;
            offset += 5;
        }

        // One report line per word, written after the fact: previously "Success" was logged
        // before any download and "Failure" was logged whenever paging ran out of pages.
        var outcome = (pagesParsed > 0)
            ? "Success. Page was parsed. Word: "
            : "Failure. Examples was not found. Word: ";
        reportStream.Write(outcome + word + "\n");
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Converts every example block of a page into one tab-separated card line
/// and writes it to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Destination for the generated card lines.</param>
/// <param name="document">Parsed HTML of the page.</param>
/// <param name="link">Page identifier; only used in report messages.</param>
/// <param name="userLabels">Extra labels appended after the "macmillan" and word labels; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var root = document.DocumentNode;

    var exampleBlocks = root.SelectNodes("//div[@class='EXAMPLES']");
    if (exampleBlocks == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    var pronunciationNode = root.SelectSingleNode("//span[@class='PRON']");
    var word = getText(root.SelectSingleNode("//span[@class='BASE']"));

    // Labels: "macmillan <word-with-dashes>[ <userLabels>]".
    var cardLabels = "macmillan " + word.Replace(' ', '-');
    if (userLabels != "")
    {
        cardLabels = cardLabels + " " + userLabels;
    }

    foreach (HtmlNode exampleBlock in exampleBlocks)
    {
        var card = new Card();
        var owner = exampleBlock.ParentNode;

        // The structure is the example's own <strong>; when absent, fall back to the
        // BASE span under the node three levels above the example.
        var structureNode = exampleBlock.SelectSingleNode("strong")
            ?? owner.ParentNode.ParentNode.SelectSingleNode("div/h2/span[@class='BASE']");

        card.Sentence = getText(exampleBlock.SelectSingleNode("p[@id='EXAMPLE']"));
        card.Definition = getText(owner.SelectSingleNode("span[@class='DEFINITION']"));
        card.GbrTranscription = getText(pronunciationNode).Replace("/", "");
        card.SimpleStructure = getText(structureNode).Replace(":", "");

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }

        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        // Column order of the output line is fixed; fields are tab-separated.
        var line = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + card.SimpleStructure + "\t"
            + card.Definition + "\t"
            + cardLabels + "\n";

        stream.Write(line);
        count_++;
    }
}
/// <summary>
/// Extracts every example (<c>span.x-g</c>) of a page, together with its structure,
/// definition and transcriptions, and writes one tab-separated card line per example.
/// </summary>
/// <param name="stream">Destination for the generated card lines.</param>
/// <param name="document">Parsed HTML of the page.</param>
/// <param name="link">Page identifier; only used in report messages.</param>
/// <param name="userLabels">Extra labels appended after the "oald8" and word labels; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var root = document.DocumentNode;

    var exampleNodes = root.SelectNodes("//span[@class='x-g']");
    if (exampleNodes == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    // Page-level data shared by all cards.
    var usaNode = root.SelectSingleNode("//span[@class='y']");
    var gbrNode = root.SelectSingleNode("//span[@class='i']");
    var word = getText(root.SelectSingleNode("//h2[@class='h']"));

    // Labels: "oald8 <word-with-dashes>[ <userLabels>]".
    var cardLabels = "oald8 " + word.Replace(' ', '-');
    if (userLabels != "")
    {
        cardLabels = cardLabels + " " + userLabels;
    }

    foreach (HtmlNode exampleNode in exampleNodes)
    {
        var card = new Card();
        var container = exampleNode.ParentNode;
        var containerKind = GetName(container);
        var ownPattern = exampleNode.SelectSingleNode("span[@class='cf']");

        // Candidate structure nodes, outermost first; which ones apply depends on
        // the kind of the example's container. Unknown kinds contribute nothing.
        HtmlNode[] structureNodes;
        if (containerKind == "pv-g")
        {
            structureNodes = new HtmlNode[] { container.SelectSingleNode("h4[@class='pv']"), ownPattern };
        }
        else if (containerKind == "n-g")
        {
            if (GetName(container.ParentNode) == "pv-g")
            {
                structureNodes = new HtmlNode[]
                {
                    container.ParentNode.SelectSingleNode("h4[@class='pv']"),
                    container.SelectSingleNode("span[@class='vs-g']"),
                    ownPattern
                };
            }
            else
            {
                structureNodes = new HtmlNode[] { container.SelectSingleNode("span[@class='cf']"), ownPattern };
            }
        }
        else if (containerKind == "id-g")
        {
            structureNodes = new HtmlNode[] { container.SelectSingleNode("h4[@class='id']"), ownPattern };
        }
        else if (containerKind == "h-g")
        {
            structureNodes = new HtmlNode[] { container.SelectSingleNode("span[@class='cf']"), ownPattern };
        }
        else
        {
            structureNodes = new HtmlNode[0];
        }

        foreach (var structureNode in structureNodes)
        {
            if (structureNode != null)
            {
                card.Structure.Add(structureNode.InnerText);
            }
        }

        // Definition: for "id-g"/"h-g" containers look inside their def_block when present,
        // otherwise directly in the container; prefer span.ud over span.d.
        HtmlNode definitionScope = null;
        if (containerKind == "id-g" || containerKind == "h-g")
        {
            definitionScope = container.SelectSingleNode("div[@class='def_block']");
        }
        definitionScope = definitionScope ?? container;

        var definitionNode = definitionScope.SelectSingleNode("span[@class='ud']")
            ?? definitionScope.SelectSingleNode("span[@class='d']");

        // The sentence is the example text with any " (=...)" gloss removed.
        card.Interpretation = getText(exampleNode.SelectSingleNode("span[@class='x']"));
        card.Sentence = Regex.Replace(card.Interpretation, " \\(=.*?\\)", "");
        // NOTE(review): search and replacement text look identical here; possibly a
        // non-breaking space was intended as the first argument — confirm.
        card.Definition = getText(definitionNode).Replace(" ", " ");
        card.UsaTranscription = getText(usaNode);
        card.GbrTranscription = getText(gbrNode);

        // Keep the interpretation only when it actually differs from the sentence.
        if (card.Interpretation == card.Sentence)
        {
            card.Interpretation = "";
        }

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }

        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        var line = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + PrintList(card.Structure) + "\t"
            + card.Definition + "\t"
            + cardLabels + "\n";

        stream.Write(line);
        count_++;
    }
}
/// <summary>
/// For every word: scrapes its short definitions from the definitions page, then requests
/// example sentences in batches and writes one tab-separated card line per sentence.
/// </summary>
/// <param name="stream">Destination for the generated card lines.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="userLabels">Extra labels appended to every card after "vocabulary_com".</param>
/// <param name="domain">Value passed through as the <c>domain</c> query parameter.</param>
/// <param name="limit">Maximum number of examples requested per word.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
{
    const int maxBatchSize = 48;
    // A request that keeps failing is retried, but only this many times in a row.
    const int maxConsecutiveFailures = 3;

    var initialStep = (limit < maxBatchSize) ? limit : maxBatchSize;
    userLabels = "vocabulary_com " + userLabels;
    var quickDefinitionXPaths = new[] { "//div[@class='def']", "//div[@class='def selected']" };

    foreach (String word in wordlist)
    {
        var offset = 0;
        var step = initialStep;
        var consecutiveFailures = 0;
        var wordLabel = word.Replace(' ', '-');
        var primaryDefinitions = String.Empty;
        var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
        Console.WriteLine("{0}", word);

        // Primary definitions: each quick definition ("quickDef<N>") is paired with the
        // text of the anchor that links to "#s<N>".
        // NOTE(review): the closing tags are emitted as "</i></b>" (mis-nested); kept
        // unchanged so the produced cards stay identical.
        foreach (var xpath in quickDefinitionXPaths)
        {
            var quickNodes = page.SelectNodes(xpath);
            if (quickNodes == null)
            {
                continue;
            }

            foreach (var node in quickNodes)
            {
                var href = "#s" + Regex.Replace(node.Attributes["id"].Value, @"quickDef(\d+)", "$1");
                primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</i></b> " + node.InnerText + "<br/>";
            }
        }

        // Fallback when the page has no quick definitions: use the full definition headers.
        if (primaryDefinitions.Length == 0)
        {
            var headerNodes = page.SelectNodes("//h3[@class='definition']");
            if (headerNodes != null)
            {
                foreach (var header in headerNodes)
                {
                    var partOfSpeech = header.SelectSingleNode("a");
                    var currentDefinition = Regex.Replace(getText(header), "[\t\r\n]", "");
                    // Drop the leading word of the header text, keeping the rest.
                    currentDefinition = Regex.Replace(currentDefinition, @"^\w+\s+(.*?)$", "$1");
                    primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</i></b> " + currentDefinition + "<br/>";
                }

                // Remove the trailing "<br/>".
                primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
            }
        }

        do
        {
            // Shrink the last batch so that no more than `limit` examples are requested.
            if (limit - offset < step)
            {
                step = limit - offset;
            }

            if (step <= 0)
            {
                break;
            }

            try
            {
                // Decode as UTF-8: decoding as ASCII replaces every non-ASCII character with '?'.
                var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset + "&filter=0&domain=" + domain));
                // Turn "$d(...)" wrappers into quoted strings so the payload parses as JSON.
                rawJson = Regex.Replace(rawJson, @"\$d\((.*?)\)", @"""$1""");
                var json = JObject.Parse(rawJson);

                if (offset == 0)
                {
                    var hits = (int)json.SelectToken("result.totalHits");
                    reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                    Console.WriteLine("Hits: {0}", hits);
                    if (hits == 0)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                    }
                }

                Console.WriteLine("Processed: {0}", offset);
                offset += step;
                consecutiveFailures = 0;

                foreach (var entry in json.SelectToken("result.sentences"))
                {
                    count_++;
                    var sentence = SafeTrim((string)entry.SelectToken("sentence"));
                    // Fields are joined with a placeholder first so that tabs and line breaks
                    // inside a field can be stripped before the real separators are inserted.
                    var example = new StringBuilder()
                        .Append(sentence)
                        .Append("%%!!%%")
                        .Append(sentence)
                        .Append("%%!!%%")
                        .Append(word)
                        .Append("%%!!%%")
                        .Append(primaryDefinitions)
                        .Append("%%!!%%")
                        .Append(wordLabel)
                        .Append(" ")
                        .Append(userLabels);
                    stream.Write(Regex.Replace(example.ToString(), "[\t\n\r]", "").Replace("%%!!%%", "\t") + "\n");
                }
            }
            catch (Exception ex)
            {
                // Previously a bare "catch { continue; }": when the failure happened before the
                // offset advanced, the same request was retried forever. Retry a bounded number
                // of times, record the problem, then give up on this word.
                consecutiveFailures++;
                reportStream.Write("Failure. Examples request failed. Word: " + word + " Offset: " + offset + " Error: " + ex.Message + "\n");
                if (consecutiveFailures >= maxConsecutiveFailures)
                {
                    break;
                }
            }
        } while (step > 0);

        Console.WriteLine("Processed: {0}\n", offset);
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Parses the search page of every word and then the pages linked from its search-results
/// block: always the numbered entries of the same word ("word_2", ...), and every other
/// collected link when <paramref name="crossreferenceFlag"/> is set.
/// </summary>
/// <param name="stream">Destination for the generated cards.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="labels">Extra labels passed on to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">When true, every not-yet-parsed collected link is parsed, not only same-word entries.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. It may be possible to speed this up through parallelization.
    // Only links from the search-results block are followed (links inside the article body are not).

    // Link -> "already parsed" flag; shared across all words so no link is parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);
        ParsePage(ref stream, page, word, labels);
        Console.WriteLine("{0}", word);
        reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

        // Take all entries from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode linkNode in linksNodes)
            {
                var url = getCleanUrl(linkNode);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }

        // Matches numbered entries of this word, e.g. "word_2". Built once per word, and the
        // word is escaped so that regex metacharacters in it ("c++", "a.b") are taken literally
        // instead of throwing or matching unrelated links.
        var sameWordEntry = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

        // The table cannot be modified while its keys are enumerated, so updates are collected first.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            if (crossreferenceFlag || sameWordEntry.IsMatch(link))
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                Console.WriteLine("{0}", link);
                updatedWordList.Add(link, true);
                reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
            }
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Builds cards for each word of <paramref name="wordlist"/>: the short definitions come from
/// the word's definitions page, the example sentences from batched requests to the examples
/// endpoint. Each sentence becomes one tab-separated line written to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Receives the generated card lines.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="userLabels">Extra labels appended to every card after "vocabulary_com".</param>
/// <param name="domain">Value sent as the <c>domain</c> query parameter.</param>
/// <param name="limit">Maximum number of examples requested per word.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
{
    const int batchCap = 48;
    // Upper bound on back-to-back failed requests before a word is abandoned.
    const int retryCap = 3;

    var firstStep = (limit < batchCap) ? limit : batchCap;
    userLabels = "vocabulary_com " + userLabels;

    foreach (String word in wordlist)
    {
        var offset = 0;
        var step = firstStep;
        var failedInARow = 0;
        var wordLabel = word.Replace(' ', '-');
        var primaryDefinitions = String.Empty;
        var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
        Console.WriteLine("{0}", word);

        // Primary definitions, unselected ones first, then the selected one(s). Each
        // "quickDef<N>" block is prefixed with the text of the anchor pointing at "#s<N>".
        // NOTE(review): "</i></b>" closes the tags in the wrong order; left as is so the
        // card content does not change.
        foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
        {
            var definitionsNodes = page.SelectNodes(xpath);
            if (definitionsNodes == null)
            {
                continue;
            }

            foreach (var p in definitionsNodes)
            {
                var href = "#s" + Regex.Replace(p.Attributes["id"].Value, @"quickDef(\d+)", "$1");
                primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</i></b> " + p.InnerText + "<br/>";
            }
        }

        // No quick definitions on the page: fall back to the full definition headers.
        if (primaryDefinitions.Length == 0)
        {
            var fullNodes = page.SelectNodes("//h3[@class='definition']");
            if (fullNodes != null)
            {
                foreach (var f in fullNodes)
                {
                    var partOfSpeech = f.SelectSingleNode("a");
                    var currentDefinition = Regex.Replace(getText(f), "[\t\r\n]", "");
                    // Strip the first word of the header text.
                    currentDefinition = Regex.Replace(currentDefinition, @"^\w+\s+(.*?)$", "$1");
                    primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</i></b> " + currentDefinition + "<br/>";
                }

                // Cut off the final "<br/>".
                primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
            }
        }

        do
        {
            // Never request past `limit`.
            if (limit - offset < step)
            {
                step = limit - offset;
            }

            if (step <= 0)
            {
                break;
            }

            try
            {
                var url = examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset + "&filter=0&domain=" + domain;
                // UTF-8, not ASCII: an ASCII decode turns every non-ASCII character into '?'.
                var payload = System.Text.Encoding.UTF8.GetString(client.DownloadData(url));
                // "$d(...)" wrappers are not valid JSON; quote their content.
                payload = Regex.Replace(payload, @"\$d\((.*?)\)", @"""$1""");
                var json = JObject.Parse(payload);

                if (offset == 0)
                {
                    var hits = (int)json.SelectToken("result.totalHits");
                    reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                    Console.WriteLine("Hits: {0}", hits);
                    if (hits == 0)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                    }
                }

                Console.WriteLine("Processed: {0}", offset);
                offset += step;
                failedInARow = 0;

                foreach (var item in json.SelectToken("result.sentences"))
                {
                    count_++;
                    var sentence = SafeTrim((string)item.SelectToken("sentence"));
                    // A placeholder separates the fields until stray tabs/line breaks inside
                    // them have been removed; only then is it replaced by real tabs.
                    var example = new StringBuilder()
                        .Append(sentence)
                        .Append("%%!!%%")
                        .Append(sentence)
                        .Append("%%!!%%")
                        .Append(word)
                        .Append("%%!!%%")
                        .Append(primaryDefinitions)
                        .Append("%%!!%%")
                        .Append(wordLabel)
                        .Append(" ")
                        .Append(userLabels);
                    stream.Write(Regex.Replace(example.ToString(), "[\t\n\r]", "").Replace("%%!!%%", "\t") + "\n");
                }
            }
            catch (Exception ex)
            {
                // The former "catch { continue; }" looped forever when a request failed before
                // the offset moved on. Log the error and stop after a few consecutive failures.
                failedInARow++;
                reportStream.Write("Failure. Examples request failed. Word: " + word + " Offset: " + offset + " Error: " + ex.Message + "\n");
                if (failedInARow >= retryCap)
                {
                    break;
                }
            }
        } while (step > 0);

        Console.WriteLine("Processed: {0}\n", offset);
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Writes one tab-separated card line to <paramref name="stream"/> for each example
/// (<c>span.x-g</c>) found in <paramref name="document"/>.
/// </summary>
/// <param name="stream">Receives the generated card lines.</param>
/// <param name="document">Parsed HTML of the page.</param>
/// <param name="link">Page identifier used in report messages.</param>
/// <param name="userLabels">Extra labels placed after the "oald8" and word labels; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var examples = document.DocumentNode.SelectNodes("//span[@class='x-g']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    // Transcriptions and headword are page-wide.
    var usaTranscription = document.DocumentNode.SelectSingleNode("//span[@class='y']");
    var gbrTranscription = document.DocumentNode.SelectSingleNode("//span[@class='i']");
    var word = getText(document.DocumentNode.SelectSingleNode("//h2[@class='h']"));

    var allLabels = "oald8 " + word.Replace(' ', '-') + ((userLabels == "") ? "" : " " + userLabels);

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parent = example.ParentNode;
        var parentName = GetName(parent);
        var examplePattern = example.SelectSingleNode("span[@class='cf']");

        // Collect the possible structure parts in output order; missing ones are skipped below.
        var parts = new List<HtmlNode>();
        var knownParent = true;
        if (parentName == "pv-g")
        {
            parts.Add(parent.SelectSingleNode("h4[@class='pv']"));
        }
        else if (parentName == "n-g" && GetName(parent.ParentNode) == "pv-g")
        {
            parts.Add(parent.ParentNode.SelectSingleNode("h4[@class='pv']"));
            parts.Add(parent.SelectSingleNode("span[@class='vs-g']"));
        }
        else if (parentName == "n-g" || parentName == "h-g")
        {
            parts.Add(parent.SelectSingleNode("span[@class='cf']"));
        }
        else if (parentName == "id-g")
        {
            parts.Add(parent.SelectSingleNode("h4[@class='id']"));
        }
        else
        {
            // Any other parent yields no structure at all, not even the example's own pattern.
            knownParent = false;
        }

        if (knownParent)
        {
            parts.Add(examplePattern);
        }

        foreach (var part in parts)
        {
            if (part != null)
            {
                card.Structure.Add(part.InnerText);
            }
        }

        // Definition lookup starts in the def_block of "id-g"/"h-g" parents when there is one,
        // else in the parent itself; span.ud wins over span.d.
        var searchRoot = parent;
        if (parentName == "id-g" || parentName == "h-g")
        {
            var defBlock = parent.SelectSingleNode("div[@class='def_block']");
            if (defBlock != null)
            {
                searchRoot = defBlock;
            }
        }

        var definition = searchRoot.SelectSingleNode("span[@class='ud']");
        if (definition == null)
        {
            definition = searchRoot.SelectSingleNode("span[@class='d']");
        }

        // Example text; the sentence is the same text without " (=...)" glosses.
        var glossPattern = new Regex(" \\(=.*?\\)");
        card.Interpretation = getText(example.SelectSingleNode("span[@class='x']"));
        card.Sentence = glossPattern.Replace(card.Interpretation, "");
        card.Definition = getText(definition);
        // NOTE(review): both arguments look identical as written; a non-breaking space may
        // have been intended as the text to replace — confirm.
        card.Definition = card.Definition.Replace(" ", " ");
        card.UsaTranscription = getText(usaTranscription);
        card.GbrTranscription = getText(gbrTranscription);

        // An interpretation identical to the sentence carries no extra information.
        if (card.Interpretation == card.Sentence)
        {
            card.Interpretation = "";
        }

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }

        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        stream.Write(
            card.Sentence + "\t" +
            card.Interpretation + "\t" +
            word + "\t" +
            card.GbrTranscription + "\t" +
            card.UsaTranscription + "\t" +
            PrintList(card.Structure) + "\t" +
            card.Definition + "\t" +
            allLabels + "\n");
        count_++;
    }
}
/// <summary>
/// Loads the search page of every word, collects the links of its related-entries block and
/// parses the current page, the numbered entries of the same word ("word_2", ...) and, when
/// <paramref name="crossreferenceFlag"/> is set, every other collected link.
/// </summary>
/// <param name="stream">Destination for the generated cards.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="labels">Extra labels passed on to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">When true, every not-yet-parsed collected link is parsed.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. It may be possible to speed this up through parallelization.
    // Only links from the related-entries block are followed (links inside the article body are not).

    // Link -> "already parsed" flag; shared across all words so no link is parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);

        // Determine under which link the loaded page itself is listed.
        var currentPageNode = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
        string currentPageLink = (currentPageNode != null) ? getCleanUrl(currentPageNode) : "";

        // Take all entries from the related-entries block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode linkNode in linksNodes)
            {
                var url = getCleanUrl(linkNode);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }
        else
        {
            reportStream.Write("Failure. Page not found. Link: " + word + "\n");
        }

        // Matches numbered entries of this word, e.g. "word_2". Built once per word, and the
        // word is escaped so that regex metacharacters in it ("c++", "a.b") are taken literally
        // instead of throwing or matching unrelated links.
        var sameWordEntry = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

        // Everything needed is collected; process the links. The table cannot be modified
        // while its keys are enumerated, so updates are gathered and applied afterwards.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            var isCurrentPage = currentPageLink == link;
            if (!isCurrentPage && !crossreferenceFlag && !sameWordEntry.IsMatch(link))
            {
                continue;
            }

            if (isCurrentPage)
            {
                // The first page is already downloaded; reuse it.
                ParsePage(ref stream, page, word, labels);
            }
            else
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
            }

            Console.WriteLine("{0}", link);
            updatedWordList.Add(link, true);
            reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}