/// <summary>
/// Loads and parses the page of every word in <paramref name="wordlist"/>, then follows the
/// links found in that page's "entrylist" block.
/// </summary>
/// <param name="stream">Card output, passed through to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up; each one is appended to <c>searchPath</c>.</param>
/// <param name="labels">User labels, passed through to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every not-yet-parsed link is followed; when false only links of the form
/// <c>word_N</c> (spaces in the word replaced by '-') are followed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could probably be sped up by parallelising.
    // Only the links of the search-results block are followed (including the word itself).
    // Maps a cleaned link to a bool that becomes true once the link has been parsed.
    // The table is shared by all words, so a link is never parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);
        ParsePage(ref stream, page, word, labels);
        Console.WriteLine("{0}", word);
        reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

        // Take every link from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }

        // Built once per word. The word is escaped so that regex metacharacters in it are
        // matched literally instead of throwing or changing the meaning of the pattern.
        var numberedEntryPattern = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // The table cannot be modified while its keys are enumerated, so the links parsed
        // in this pass are collected here and flagged afterwards.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            if (crossreferenceFlag || numberedEntryPattern.Match(link).Success)
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                Console.WriteLine("{0}", link);
                updatedWordList.Add(link, true);
                reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
            }
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Writes one tab-separated card line to <paramref name="stream"/> for every
/// <c>div.EXAMPLES</c> block found in <paramref name="document"/>.
/// </summary>
/// <param name="stream">Receives one line per example.</param>
/// <param name="document">Downloaded page to scan.</param>
/// <param name="link">Word or link the page came from; used only in report messages.</param>
/// <param name="userLabels">Extra labels appended after "macmillan word"; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var root = document.DocumentNode;

    var examples = root.SelectNodes("//div[@class='EXAMPLES']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    var gbrTranscription = root.SelectSingleNode("//span[@class='PRON']");
    var word = getText(root.SelectSingleNode("//span[@class='BASE']"));
    var wordLabel = word.Replace(' ', '-');
    userLabels = (userLabels == "")
        ? "macmillan " + wordLabel
        : "macmillan " + wordLabel + " " + userLabels;

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parentNode = example.ParentNode;

        // Structure: the example's own <strong>, otherwise the headword span reachable from
        // the node two levels above the parent (original note: DIV#SENSE_BODY -> DIV -> LI).
        var structure = example.SelectSingleNode("strong");
        if (structure == null)
        {
            // Climb step by step so a shallow or malformed page does not dereference null.
            var grandParent = parentNode.ParentNode;
            var entryNode = (grandParent != null) ? grandParent.ParentNode : null;
            if (entryNode != null)
            {
                structure = entryNode.SelectSingleNode("div/h2/span[@class='BASE']");
            }
        }

        // NOTE(review): getText is assumed to accept a null node; it is already called on
        // unchecked SelectSingleNode results below and its result compared with "".
        card.Sentence = getText(example.SelectSingleNode("p[@id='EXAMPLE']"));
        card.Definition = getText(parentNode.SelectSingleNode("span[@class='DEFINITION']"));
        card.GbrTranscription = getText(gbrTranscription).Replace("/", "");
        card.SimpleStructure = getText(structure).Replace(":", "");

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }
        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        // Interpretation and UsaTranscription are never assigned in this method; they are
        // written with whatever value a new Card gives them.
        var outStr = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + card.SimpleStructure + "\t"
            + card.Definition + "\t"
            + userLabels + "\n";
        stream.Write(outStr);
        count_++;
    }
}
/// <summary>
/// Downloads example pages for every word in <paramref name="wordlist"/> and writes the parsed
/// rows to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Receives the text produced by <c>Parse</c>.</param>
/// <param name="wordlist">Words to request examples for.</param>
/// <param name="labels">User labels; prefixed with "lingvo " and followed by the word.</param>
/// <param name="limit">Paging stops once the request offset reaches this value.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
{
    // The request offset advances by this amount after each page.
    const int offsetStep = 5;

    // "lingvo " when no labels were given, otherwise "lingvo <labels> ".
    labels = (labels == "") ? "lingvo " + labels : "lingvo " + labels + " ";

    foreach (var word in wordlist)
    {
        Console.WriteLine("{0}", word);

        var cardLabels = labels + word;
        var offset = 0;
        while (offset < limit)
        {
            var page = new HtmlDocument();
            page.LoadHtml(DownloadExamples(word, offset));

            var firstRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
            if (firstRows == null)
            {
                // A page without rows is a failure only on the first request; on later
                // requests it just means the examples ran out.
                if (offset == 0)
                {
                    reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                }
                break;
            }

            // Success is reported once, and only after a page actually contained examples.
            if (offset == 0)
            {
                reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
            }

            stream.Write(Parse(firstRows, cardLabels));
            count_ += firstRows.Count;

            var otherRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
            if (otherRows == null)
            {
                break;
            }

            stream.Write(Parse(otherRows, cardLabels));
            count_ += otherRows.Count;
            offset += offsetStep;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Downloads example pages for every word in <paramref name="wordlist"/> and writes the parsed
/// rows to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Receives the text produced by <c>Parse</c>.</param>
/// <param name="wordlist">Words to request examples for.</param>
/// <param name="labels">User labels; prefixed with "lingvo " and followed by the word.</param>
/// <param name="limit">Paging stops once the request offset reaches this value.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
{
    // The request offset advances by this amount after each page.
    const int offsetStep = 5;

    // "lingvo " when no labels were given, otherwise "lingvo <labels> ".
    labels = (labels == "") ? "lingvo " + labels : "lingvo " + labels + " ";

    foreach (var word in wordlist)
    {
        Console.WriteLine("{0}", word);

        var cardLabels = labels + word;
        var offset = 0;
        while (offset < limit)
        {
            var page = new HtmlDocument();
            page.LoadHtml(DownloadExamples(word, offset));

            var firstRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
            if (firstRows == null)
            {
                // A page without rows is a failure only on the first request; on later
                // requests it just means the examples ran out.
                if (offset == 0)
                {
                    reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                }
                break;
            }

            // Success is reported once, and only after a page actually contained examples.
            if (offset == 0)
            {
                reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
            }

            stream.Write(Parse(firstRows, cardLabels));
            count_ += firstRows.Count;

            var otherRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
            if (otherRows == null)
            {
                break;
            }

            stream.Write(Parse(otherRows, cardLabels));
            count_ += otherRows.Count;
            offset += offsetStep;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Writes one tab-separated card line to <paramref name="stream"/> for every
/// <c>span.x-g</c> example found in <paramref name="document"/>.
/// </summary>
/// <param name="stream">Receives one line per example.</param>
/// <param name="document">Downloaded page to scan.</param>
/// <param name="link">Word or link the page came from; used only in report messages.</param>
/// <param name="userLabels">Extra labels appended after "oald8 word"; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var root = document.DocumentNode;

    var examples = root.SelectNodes("//span[@class='x-g']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    // Page-wide values: both transcriptions and the headword.
    var usaTranscription = root.SelectSingleNode("//span[@class='y']");
    var gbrTranscription = root.SelectSingleNode("//span[@class='i']");
    var word = getText(root.SelectSingleNode("//h2[@class='h']"));

    // Labels: "oald8 <word-with-dashes>" followed by the user labels, if any.
    var wordLabel = word.Replace(' ', '-');
    var cardLabels = "oald8 " + wordLabel;
    if (userLabels != "")
    {
        cardLabels = cardLabels + " " + userLabels;
    }

    // Matches a " (= ...)" gloss inside an example sentence.
    var glossPattern = new Regex(" \\(=.*?\\)");

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parentNode = example.ParentNode;
        var parentName = GetName(parentNode);

        // Structure parts are added outermost first: heading of the enclosing group,
        // then the parent's own marker, then the example's own "cf" span.
        var ownPattern = example.SelectSingleNode("span[@class='cf']");
        HtmlNode outerHeading = null;
        HtmlNode parentMarker = null;
        var knownParent = true;

        if (parentName == "pv-g")
        {
            parentMarker = parentNode.SelectSingleNode("h4[@class='pv']");
        }
        else if (parentName == "n-g" && GetName(parentNode.ParentNode) == "pv-g")
        {
            parentMarker = parentNode.SelectSingleNode("span[@class='vs-g']");
            outerHeading = parentNode.ParentNode.SelectSingleNode("h4[@class='pv']");
        }
        else if (parentName == "n-g" || parentName == "h-g")
        {
            parentMarker = parentNode.SelectSingleNode("span[@class='cf']");
        }
        else if (parentName == "id-g")
        {
            parentMarker = parentNode.SelectSingleNode("h4[@class='id']");
        }
        else
        {
            // Under any other parent nothing is recorded, not even the example's own span.
            knownParent = false;
        }

        if (knownParent)
        {
            foreach (var part in new[] { outerHeading, parentMarker, ownPattern })
            {
                if (part != null)
                {
                    card.Structure.Add(part.InnerText);
                }
            }
        }

        // Definition: searched inside the "def_block" of an id-g / h-g parent when there is
        // one, otherwise directly under the parent; "ud" is preferred over "d".
        HtmlNode definitionScope = null;
        if (parentName == "id-g" || parentName == "h-g")
        {
            definitionScope = parentNode.SelectSingleNode("div[@class='def_block']");
        }
        if (definitionScope == null)
        {
            definitionScope = parentNode;
        }
        var definition = definitionScope.SelectSingleNode("span[@class='ud']");
        if (definition == null)
        {
            definition = definitionScope.SelectSingleNode("span[@class='d']");
        }

        // The example itself: Sentence is the text with the gloss removed; Interpretation
        // keeps the full text only when a gloss was actually present.
        card.Interpretation = getText(example.SelectSingleNode("span[@class='x']"));
        card.Sentence = glossPattern.Replace(card.Interpretation, "");
        card.Definition = getText(definition);
        // NOTE(review): both arguments render as a plain space, which would make this a no-op;
        // the first may be a non-breaking space in the file's bytes - confirm.
        card.Definition = card.Definition.Replace(" ", " ");
        card.UsaTranscription = getText(usaTranscription);
        card.GbrTranscription = getText(gbrTranscription);
        if (card.Interpretation == card.Sentence)
        {
            card.Interpretation = "";
        }

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }
        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        var outStr = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + PrintList(card.Structure) + "\t"
            + card.Definition + "\t"
            + cardLabels + "\n";
        stream.Write(outStr);
        count_++;
    }
}
/// <summary>
/// For every word in <paramref name="wordlist"/>: scrapes its definitions from the definitions
/// page, then pages through the examples service and writes one tab-separated line per sentence
/// to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Receives one line per example sentence.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="userLabels">User labels; prefixed with "vocabulary_com ".</param>
/// <param name="domain">Value sent as the <c>domain</c> query parameter.</param>
/// <param name="limit">Maximum number of examples requested per word.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
{
    const int maxPageSize = 48;
    // A failing request is retried, but only this many times in a row.
    const int maxConsecutiveFailures = 3;

    var pageSize = (limit < maxPageSize) ? limit : maxPageSize;
    var quickDefIdPattern = new Regex(@"quickDef(\d+)");
    var leadingWordPattern = new Regex(@"^\w+\s+(.*?)$");
    var controlCharsPattern = new Regex("[\t\n\r]");
    // Rewrites $d(...) calls in the response into quoted strings so it parses as JSON.
    var dateCallPattern = new Regex(@"\$d\((.*?)\)");

    userLabels = "vocabulary_com " + userLabels;

    foreach (String word in wordlist)
    {
        var offset = 0;
        var step = pageSize;
        var consecutiveFailures = 0;
        var wordLabel = word.Replace(' ', '-');
        var primaryDefinitions = String.Empty;
        var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
        Console.WriteLine("{0}", word);

        // Primary definitions: the quick-definition boxes, unselected ones first.
        foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
        {
            var quickDefs = page.SelectNodes(xpath);
            if (quickDefs == null)
            {
                continue;
            }
            foreach (var quickDef in quickDefs)
            {
                // The box id "quickDefN" corresponds to the anchor with href "#sN".
                var href = "#s" + quickDefIdPattern.Replace(quickDef.Attributes["id"].Value, "$1");
                primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']"))
                    + "</b></i> " + quickDef.InnerText + "<br/>";
            }
        }

        // Fallback: the full definition headings, with the trailing "<br/>" removed.
        if (primaryDefinitions.Length == 0)
        {
            var headings = page.SelectNodes("//h3[@class='definition']");
            if (headings != null)
            {
                foreach (var heading in headings)
                {
                    var partOfSpeech = heading.SelectSingleNode("a");
                    var currentDefinition = controlCharsPattern.Replace(getText(heading), "");
                    currentDefinition = leadingWordPattern.Replace(currentDefinition, "$1");
                    primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</b></i> " + currentDefinition + "<br/>";
                }
                primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
            }
        }

        while (true)
        {
            // Shrink the last page so that no more than `limit` examples are requested.
            if (limit - offset < step)
            {
                step = limit - offset;
            }
            if (step <= 0)
            {
                break;
            }

            try
            {
                var url = examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset
                    + "&filter=0&domain=" + domain;
                // Decode as UTF-8; ASCII decoding replaces every non-ASCII character with '?'.
                var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(url));
                rawJson = dateCallPattern.Replace(rawJson, @"""$1""");
                var json = JObject.Parse(rawJson);

                if (offset == 0)
                {
                    var hits = (int)json.SelectToken("result.totalHits");
                    reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                    Console.WriteLine("Hits: {0}", hits);
                    if (hits == 0)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                    }
                }

                Console.WriteLine("Processed: {0}", offset);
                offset += step;

                var received = 0;
                foreach (var sentence in json.SelectToken("result.sentences"))
                {
                    received++;
                    count_++;
                    // The sentence is written twice on purpose: it fills the first two columns.
                    var example = new StringBuilder()
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append("%%!!%%")
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append("%%!!%%")
                        .Append(word)
                        .Append("%%!!%%")
                        .Append(primaryDefinitions)
                        .Append("%%!!%%")
                        .Append(wordLabel)
                        .Append(" ")
                        .Append(userLabels);
                    // Strip control characters from the content first, then turn the
                    // placeholders into the real column separators.
                    stream.Write(controlCharsPattern.Replace(example.ToString(), "").Replace("%%!!%%", "\t") + "\n");
                }

                consecutiveFailures = 0;

                // An empty page means the service has nothing more for this word.
                if (received == 0)
                {
                    break;
                }
            }
            catch (Exception e)
            {
                // Best effort: retry, but give up on this word after repeated failures
                // instead of looping on the same request forever.
                consecutiveFailures++;
                reportStream.Write("Failure. Examples request failed. Word: " + word + " Offset: " + offset
                    + " Error: " + e.Message + "\n");
                if (consecutiveFailures >= maxConsecutiveFailures)
                {
                    break;
                }
            }
        }

        Console.WriteLine("Processed: {0}\n", offset);
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Loads and parses the page of every word in <paramref name="wordlist"/>, then follows the
/// links found in that page's "entrylist" block.
/// </summary>
/// <param name="stream">Card output, passed through to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up; each one is appended to <c>searchPath</c>.</param>
/// <param name="labels">User labels, passed through to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every not-yet-parsed link is followed; when false only links of the form
/// <c>word_N</c> (spaces in the word replaced by '-') are followed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could probably be sped up by parallelising.
    // Only the links of the search-results block are followed (including the word itself).
    // Maps a cleaned link to a bool that becomes true once the link has been parsed.
    // The table is shared by all words, so a link is never parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);
        ParsePage(ref stream, page, word, labels);
        Console.WriteLine("{0}", word);
        reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

        // Take every link from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }

        // Built once per word. The word is escaped so that regex metacharacters in it are
        // matched literally instead of throwing or changing the meaning of the pattern.
        var numberedEntryPattern = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // The table cannot be modified while its keys are enumerated, so the links parsed
        // in this pass are collected here and flagged afterwards.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            if (crossreferenceFlag || numberedEntryPattern.Match(link).Success)
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                Console.WriteLine("{0}", link);
                updatedWordList.Add(link, true);
                reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
            }
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Command-line entry point: reads the options, loads the word list and runs one parser.
/// </summary>
/// <param name="args">
/// <c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c> select a parser;
/// <c>-p path</c> is the word list file (required); <c>-l labels</c>, <c>-limit n</c>,
/// <c>-vdomain d</c> and <c>-related</c> are optional.
/// </param>
public static void Main(string[] args)
{
    ParserMask useParser = ParserMask.None;
    var inputPath = "";
    var outputPath = String.Empty;
    var wordlist = new List<String>();
    var labels = "";
    var relatedFlag = false;
    var examplesLimit = Int32.MaxValue;
    var vocabularyComDomain = String.Empty;

    for (int i = 0; i < args.Length; i++)
    {
        var hasValue = i + 1 < args.Length;

        if (args[i] == "-oald8")
        {
            outputPath = "oald8";
            useParser |= ParserMask.Oald8;
        }
        else if (args[i] == "-macmillan")
        {
            outputPath = "macmillan";
            useParser |= ParserMask.Macmillan;
        }
        else if (args[i] == "-vocabcom")
        {
            outputPath = "vocabulary.com";
            useParser |= ParserMask.VocabularyCom;
        }
        else if (args[i] == "-lingvo")
        {
            outputPath = "lingvo";
            useParser |= ParserMask.LingvoRu;
        }
        else if (args[i] == "-l" && hasValue)
        {
            labels = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-limit" && hasValue)
        {
            // A malformed number is reported instead of crashing with an exception.
            if (!Int32.TryParse(args[i + 1].Trim(), out examplesLimit))
            {
                Console.Write("Wrong limit: {0}", args[i + 1]);
                return;
            }
            i++;
        }
        else if (args[i] == "-vdomain" && hasValue)
        {
            vocabularyComDomain = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-p" && hasValue)
        {
            inputPath = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-related")
        {
            relatedFlag = true;
        }
    }

    if (inputPath == "")
    {
        Console.Write("Where's a listname, uh? \nYou should enter -p <path_to_you_wordlist>");
        Console.ReadKey();
        return;
    }

    try
    {
        // `using` closes the reader even when reading throws.
        using (var input = new StreamReader(inputPath))
        {
            // Everything except letters, digits, space, '-' and apostrophe is removed.
            var unwantedChars = new Regex("[^- 0-9a-zA-Z']+");
            var seen = new HashSet<String>();
            string line;
            while ((line = input.ReadLine()) != null)
            {
                var word = unwantedChars.Replace(line, "").Trim();
                // Skip blank lines and duplicates; the order of first occurrence is kept.
                if (word.Length != 0 && seen.Add(word))
                {
                    wordlist.Add(word);
                }
            }
        }
    }
    catch (FileNotFoundException)
    {
        Console.Write("Wrong path: {0}", inputPath);
        return;
    }
    catch (DirectoryNotFoundException)
    {
        Console.Write("Wrong directory: {0}", inputPath);
        return;
    }

    // Output file name: "./<parser>[ <labels>] <timestamp>.txt".
    outputPath = "./" + outputPath;
    if (labels.Length != 0)
    {
        outputPath += " " + labels;
    }
    outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

    var output = new CardsStream(outputPath, 100000);

    // Only one parser runs, even if several parser flags were given.
    if ((useParser & ParserMask.Oald8) != ParserMask.None)
    {
        var parser = new Oald8();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
    {
        var parser = new Macmillan();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
    {
        var parser = new VocabularyCom();
        parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
    {
        var parser = new LingvoRu();
        parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }

    if (useParser != ParserMask.None)
    {
        output.Save();
    }
}
/// <summary>
/// Command-line entry point: reads the options, loads the word list and runs one parser.
/// </summary>
/// <param name="args">
/// <c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c> select a parser;
/// <c>-p path</c> is the word list file (required); <c>-l labels</c>, <c>-limit n</c>,
/// <c>-vdomain d</c> and <c>-related</c> are optional.
/// </param>
public static void Main(string[] args)
{
    ParserMask useParser = ParserMask.None;
    var inputPath = "";
    var outputPath = String.Empty;
    var wordlist = new List<String>();
    var labels = "";
    var relatedFlag = false;
    var examplesLimit = Int32.MaxValue;
    var vocabularyComDomain = String.Empty;

    for (int i = 0; i < args.Length; i++)
    {
        var hasValue = i + 1 < args.Length;

        if (args[i] == "-oald8")
        {
            outputPath = "oald8";
            useParser |= ParserMask.Oald8;
        }
        else if (args[i] == "-macmillan")
        {
            outputPath = "macmillan";
            useParser |= ParserMask.Macmillan;
        }
        else if (args[i] == "-vocabcom")
        {
            outputPath = "vocabulary.com";
            useParser |= ParserMask.VocabularyCom;
        }
        else if (args[i] == "-lingvo")
        {
            outputPath = "lingvo";
            useParser |= ParserMask.LingvoRu;
        }
        else if (args[i] == "-l" && hasValue)
        {
            labels = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-limit" && hasValue)
        {
            // A malformed number is reported instead of crashing with an exception.
            if (!Int32.TryParse(args[i + 1].Trim(), out examplesLimit))
            {
                Console.Write("Wrong limit: {0}", args[i + 1]);
                return;
            }
            i++;
        }
        else if (args[i] == "-vdomain" && hasValue)
        {
            vocabularyComDomain = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-p" && hasValue)
        {
            inputPath = args[i + 1].Trim();
            i++;
        }
        else if (args[i] == "-related")
        {
            relatedFlag = true;
        }
    }

    if (inputPath == "")
    {
        Console.Write("Where's a listname, uh? \nYou should enter -p <path_to_you_wordlist>");
        Console.ReadKey();
        return;
    }

    try
    {
        // `using` closes the reader even when reading throws.
        using (var input = new StreamReader(inputPath))
        {
            // Everything except letters, digits, space, '-' and apostrophe is removed.
            var unwantedChars = new Regex("[^- 0-9a-zA-Z']+");
            var seen = new HashSet<String>();
            string line;
            while ((line = input.ReadLine()) != null)
            {
                var word = unwantedChars.Replace(line, "").Trim();
                // Skip blank lines and duplicates; the order of first occurrence is kept.
                if (word.Length != 0 && seen.Add(word))
                {
                    wordlist.Add(word);
                }
            }
        }
    }
    catch (FileNotFoundException)
    {
        Console.Write("Wrong path: {0}", inputPath);
        return;
    }
    catch (DirectoryNotFoundException)
    {
        Console.Write("Wrong directory: {0}", inputPath);
        return;
    }

    // Output file name: "./<parser>[ <labels>] <timestamp>.txt".
    outputPath = "./" + outputPath;
    if (labels.Length != 0)
    {
        outputPath += " " + labels;
    }
    outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

    var output = new CardsStream(outputPath, 100000);

    // Only one parser runs, even if several parser flags were given.
    if ((useParser & ParserMask.Oald8) != ParserMask.None)
    {
        var parser = new Oald8();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
    {
        var parser = new Macmillan();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
    {
        var parser = new VocabularyCom();
        parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
    {
        var parser = new LingvoRu();
        parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }

    if (useParser != ParserMask.None)
    {
        output.Save();
    }
}
/// <summary>
/// For every word in <paramref name="wordlist"/>: scrapes its definitions from the definitions
/// page, then pages through the examples service and writes one tab-separated line per sentence
/// to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Receives one line per example sentence.</param>
/// <param name="wordlist">Words to process.</param>
/// <param name="userLabels">User labels; prefixed with "vocabulary_com ".</param>
/// <param name="domain">Value sent as the <c>domain</c> query parameter.</param>
/// <param name="limit">Maximum number of examples requested per word.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
{
    const int maxPageSize = 48;
    // A failing request is retried, but only this many times in a row.
    const int maxConsecutiveFailures = 3;

    var pageSize = (limit < maxPageSize) ? limit : maxPageSize;
    var quickDefIdPattern = new Regex(@"quickDef(\d+)");
    var leadingWordPattern = new Regex(@"^\w+\s+(.*?)$");
    var controlCharsPattern = new Regex("[\t\n\r]");
    // Rewrites $d(...) calls in the response into quoted strings so it parses as JSON.
    var dateCallPattern = new Regex(@"\$d\((.*?)\)");

    userLabels = "vocabulary_com " + userLabels;

    foreach (String word in wordlist)
    {
        var offset = 0;
        var step = pageSize;
        var consecutiveFailures = 0;
        var wordLabel = word.Replace(' ', '-');
        var primaryDefinitions = String.Empty;
        var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
        Console.WriteLine("{0}", word);

        // Primary definitions: the quick-definition boxes, unselected ones first.
        foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
        {
            var quickDefs = page.SelectNodes(xpath);
            if (quickDefs == null)
            {
                continue;
            }
            foreach (var quickDef in quickDefs)
            {
                // The box id "quickDefN" corresponds to the anchor with href "#sN".
                var href = "#s" + quickDefIdPattern.Replace(quickDef.Attributes["id"].Value, "$1");
                primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']"))
                    + "</b></i> " + quickDef.InnerText + "<br/>";
            }
        }

        // Fallback: the full definition headings, with the trailing "<br/>" removed.
        if (primaryDefinitions.Length == 0)
        {
            var headings = page.SelectNodes("//h3[@class='definition']");
            if (headings != null)
            {
                foreach (var heading in headings)
                {
                    var partOfSpeech = heading.SelectSingleNode("a");
                    var currentDefinition = controlCharsPattern.Replace(getText(heading), "");
                    currentDefinition = leadingWordPattern.Replace(currentDefinition, "$1");
                    primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</b></i> " + currentDefinition + "<br/>";
                }
                primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
            }
        }

        while (true)
        {
            // Shrink the last page so that no more than `limit` examples are requested.
            if (limit - offset < step)
            {
                step = limit - offset;
            }
            if (step <= 0)
            {
                break;
            }

            try
            {
                var url = examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset
                    + "&filter=0&domain=" + domain;
                // Decode as UTF-8; ASCII decoding replaces every non-ASCII character with '?'.
                var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(url));
                rawJson = dateCallPattern.Replace(rawJson, @"""$1""");
                var json = JObject.Parse(rawJson);

                if (offset == 0)
                {
                    var hits = (int)json.SelectToken("result.totalHits");
                    reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                    Console.WriteLine("Hits: {0}", hits);
                    if (hits == 0)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                    }
                }

                Console.WriteLine("Processed: {0}", offset);
                offset += step;

                var received = 0;
                foreach (var sentence in json.SelectToken("result.sentences"))
                {
                    received++;
                    count_++;
                    // The sentence is written twice on purpose: it fills the first two columns.
                    var example = new StringBuilder()
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append("%%!!%%")
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append("%%!!%%")
                        .Append(word)
                        .Append("%%!!%%")
                        .Append(primaryDefinitions)
                        .Append("%%!!%%")
                        .Append(wordLabel)
                        .Append(" ")
                        .Append(userLabels);
                    // Strip control characters from the content first, then turn the
                    // placeholders into the real column separators.
                    stream.Write(controlCharsPattern.Replace(example.ToString(), "").Replace("%%!!%%", "\t") + "\n");
                }

                consecutiveFailures = 0;

                // An empty page means the service has nothing more for this word.
                if (received == 0)
                {
                    break;
                }
            }
            catch (Exception e)
            {
                // Best effort: retry, but give up on this word after repeated failures
                // instead of looping on the same request forever.
                consecutiveFailures++;
                reportStream.Write("Failure. Examples request failed. Word: " + word + " Offset: " + offset
                    + " Error: " + e.Message + "\n");
                if (consecutiveFailures >= maxConsecutiveFailures)
                {
                    break;
                }
            }
        }

        Console.WriteLine("Processed: {0}\n", offset);
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Writes one tab-separated card line to <paramref name="stream"/> for every
/// <c>span.x-g</c> example found in <paramref name="document"/>.
/// </summary>
/// <param name="stream">Receives one line per example.</param>
/// <param name="document">Downloaded page to scan.</param>
/// <param name="link">Word or link the page came from; used only in report messages.</param>
/// <param name="userLabels">Extra labels appended after "oald8 word"; may be empty.</param>
void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
{
    var root = document.DocumentNode;

    var examples = root.SelectNodes("//span[@class='x-g']");
    if (examples == null)
    {
        reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
        return;
    }

    // Page-wide values: both transcriptions and the headword.
    var usaTranscription = root.SelectSingleNode("//span[@class='y']");
    var gbrTranscription = root.SelectSingleNode("//span[@class='i']");
    var word = getText(root.SelectSingleNode("//h2[@class='h']"));

    // Labels: "oald8 <word-with-dashes>" followed by the user labels, if any.
    var wordLabel = word.Replace(' ', '-');
    var cardLabels = "oald8 " + wordLabel;
    if (userLabels != "")
    {
        cardLabels = cardLabels + " " + userLabels;
    }

    // Matches a " (= ...)" gloss inside an example sentence.
    var glossPattern = new Regex(" \\(=.*?\\)");

    foreach (HtmlNode example in examples)
    {
        var card = new Card();
        var parentNode = example.ParentNode;
        var parentName = GetName(parentNode);

        // Structure parts are added outermost first: heading of the enclosing group,
        // then the parent's own marker, then the example's own "cf" span.
        var ownPattern = example.SelectSingleNode("span[@class='cf']");
        HtmlNode outerHeading = null;
        HtmlNode parentMarker = null;
        var knownParent = true;

        if (parentName == "pv-g")
        {
            parentMarker = parentNode.SelectSingleNode("h4[@class='pv']");
        }
        else if (parentName == "n-g" && GetName(parentNode.ParentNode) == "pv-g")
        {
            parentMarker = parentNode.SelectSingleNode("span[@class='vs-g']");
            outerHeading = parentNode.ParentNode.SelectSingleNode("h4[@class='pv']");
        }
        else if (parentName == "n-g" || parentName == "h-g")
        {
            parentMarker = parentNode.SelectSingleNode("span[@class='cf']");
        }
        else if (parentName == "id-g")
        {
            parentMarker = parentNode.SelectSingleNode("h4[@class='id']");
        }
        else
        {
            // Under any other parent nothing is recorded, not even the example's own span.
            knownParent = false;
        }

        if (knownParent)
        {
            foreach (var part in new[] { outerHeading, parentMarker, ownPattern })
            {
                if (part != null)
                {
                    card.Structure.Add(part.InnerText);
                }
            }
        }

        // Definition: searched inside the "def_block" of an id-g / h-g parent when there is
        // one, otherwise directly under the parent; "ud" is preferred over "d".
        HtmlNode definitionScope = null;
        if (parentName == "id-g" || parentName == "h-g")
        {
            definitionScope = parentNode.SelectSingleNode("div[@class='def_block']");
        }
        if (definitionScope == null)
        {
            definitionScope = parentNode;
        }
        var definition = definitionScope.SelectSingleNode("span[@class='ud']");
        if (definition == null)
        {
            definition = definitionScope.SelectSingleNode("span[@class='d']");
        }

        // The example itself: Sentence is the text with the gloss removed; Interpretation
        // keeps the full text only when a gloss was actually present.
        card.Interpretation = getText(example.SelectSingleNode("span[@class='x']"));
        card.Sentence = glossPattern.Replace(card.Interpretation, "");
        card.Definition = getText(definition);
        // NOTE(review): both arguments render as a plain space, which would make this a no-op;
        // the first may be a non-breaking space in the file's bytes - confirm.
        card.Definition = card.Definition.Replace(" ", " ");
        card.UsaTranscription = getText(usaTranscription);
        card.GbrTranscription = getText(gbrTranscription);
        if (card.Interpretation == card.Sentence)
        {
            card.Interpretation = "";
        }

        if (card.Definition == "")
        {
            reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
        }
        if (card.Sentence == "")
        {
            reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
        }

        var outStr = card.Sentence + "\t"
            + card.Interpretation + "\t"
            + word + "\t"
            + card.GbrTranscription + "\t"
            + card.UsaTranscription + "\t"
            + PrintList(card.Structure) + "\t"
            + card.Definition + "\t"
            + cardLabels + "\n";
        stream.Write(outStr);
        count_++;
    }
}
/// <summary>
/// Loads the page of every word in <paramref name="wordlist"/>, collects the links of its
/// "relatedentries" block and parses the ones that qualify.
/// </summary>
/// <param name="stream">Card output, passed through to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up; each one is appended to <c>searchPath</c>.</param>
/// <param name="labels">User labels, passed through to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every not-yet-parsed link is followed; when false only the current page's own
/// link and links of the form <c>word_N</c> (spaces replaced by '-') are parsed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could probably be sped up by parallelising.
    // Only the links of the search-results block are followed (including the word itself).
    // Maps a cleaned link to a bool that becomes true once the link has been parsed.
    // The table is shared by all words, so a link is never parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);

        // Work out which link the loaded page itself corresponds to.
        var currentPageAnchor = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
        string currentPageLink = (currentPageAnchor != null) ? getCleanUrl(currentPageAnchor) : "";

        // Take every link from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }
        else
        {
            reportStream.Write("Failure. Page not found. Link: " + word + "\n");
        }

        // Built once per word. The word is escaped so that regex metacharacters in it are
        // matched literally instead of throwing or changing the meaning of the pattern.
        var numberedEntryPattern = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // Everything needed is collected; now process the link table. It cannot be modified
        // while its keys are enumerated, so parsed links are collected and flagged afterwards.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            var isCurrentPage = currentPageLink == link;
            if (!isCurrentPage && !crossreferenceFlag && !numberedEntryPattern.Match(link).Success)
            {
                continue;
            }

            if (isCurrentPage)
            {
                // The first page has already been downloaded; reuse it.
                ParsePage(ref stream, page, word, labels);
            }
            else
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
            }

            Console.WriteLine("{0}", link);
            updatedWordList.Add(link, true);
            reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Loads the page of every word in <paramref name="wordlist"/>, collects the links of its
/// "relatedentries" block and parses the ones that qualify.
/// </summary>
/// <param name="stream">Card output, passed through to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up; each one is appended to <c>searchPath</c>.</param>
/// <param name="labels">User labels, passed through to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every not-yet-parsed link is followed; when false only the current page's own
/// link and links of the form <c>word_N</c> (spaces replaced by '-') are parsed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could probably be sped up by parallelising.
    // Only the links of the search-results block are followed (including the word itself).
    // Maps a cleaned link to a bool that becomes true once the link has been parsed.
    // The table is shared by all words, so a link is never parsed twice.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);

        // Work out which link the loaded page itself corresponds to.
        var currentPageAnchor = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
        string currentPageLink = (currentPageAnchor != null) ? getCleanUrl(currentPageAnchor) : "";

        // Take every link from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }
        else
        {
            reportStream.Write("Failure. Page not found. Link: " + word + "\n");
        }

        // Built once per word. The word is escaped so that regex metacharacters in it are
        // matched literally instead of throwing or changing the meaning of the pattern.
        var numberedEntryPattern = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // Everything needed is collected; now process the link table. It cannot be modified
        // while its keys are enumerated, so parsed links are collected and flagged afterwards.
        var updatedWordList = new Hashtable();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if ((bool)crossreferenceLinks[link])
            {
                continue;
            }

            var isCurrentPage = currentPageLink == link;
            if (!isCurrentPage && !crossreferenceFlag && !numberedEntryPattern.Match(link).Success)
            {
                continue;
            }

            if (isCurrentPage)
            {
                // The first page has already been downloaded; reuse it.
                ParsePage(ref stream, page, word, labels);
            }
            else
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
            }

            Console.WriteLine("{0}", link);
            updatedWordList.Add(link, true);
            reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
        }

        foreach (DictionaryEntry update in updatedWordList)
        {
            crossreferenceLinks[update.Key] = update.Value;
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}