/// <summary>
/// Downloads example pages for every word in <paramref name="wordlist"/>, parses the
/// example rows and writes the resulting cards to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Destination for the parsed cards.</param>
/// <param name="wordlist">Words to look up.</param>
/// <param name="labels">User labels; they are prefixed with "lingvo " and followed by the word.</param>
/// <param name="limit">Paging stops once the request offset reaches this value.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
{
    // The request offset is advanced by this amount after every page.
    const int OffsetStep = 5;

    // Always ends with a single space so the word can be appended directly.
    labels = (labels == "") ? "lingvo " : "lingvo " + labels + " ";

    foreach (var word in wordlist)
    {
        Console.WriteLine("{0}", word);

        var offset = 0;
        var nothingFound = false;

        while (offset < limit)
        {
            var page = new HtmlDocument();
            page.LoadHtml(DownloadExamples(word, offset));

            var firstRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
            if (firstRows == null)
            {
                // A missing first row on the very first page means the word has no
                // examples at all; on later pages it only means paging ran out.
                nothingFound = (offset == 0);
                break;
            }
            stream.Write(Parse(firstRows, labels + word));
            count_ += firstRows.Count;

            var remainingRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
            if (remainingRows == null)
            {
                break;
            }
            stream.Write(Parse(remainingRows, labels + word));
            count_ += remainingRows.Count;

            offset += OffsetStep;
        }

        // Report the outcome once, after the word was actually processed.
        if (nothingFound)
        {
            reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
        }
        else
        {
            reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Parses the search page of every word in <paramref name="wordlist"/> and then the
/// pages linked from that page's "entrylist" block, writing the results to
/// <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Destination passed on to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up.</param>
/// <param name="labels">Labels passed on to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every collected link is parsed; otherwise only links of the form
/// "&lt;word&gt;_&lt;number&gt;" (spaces in the word replaced by '-') are parsed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could possibly be sped up by parallelizing.
    // Collect the links in the article body and add them to the list.
    //downloadPages(ref stream, GetArticleCrossrefenceLinkList(), labels);

    // Work with the links found in the search-results block, including the link to the word itself.
    // Maps link -> "already parsed" flag; shared by all words so a link is parsed at most once.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);
        ParsePage(ref stream, page, word, labels);
        Console.WriteLine("{0}", word);
        reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

        // Take all words from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }

        // Depends only on the current word, so build it once per word. The word is
        // escaped so regex metacharacters in it are matched literally.
        var numberedEntryRegex = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // Snapshot the unparsed links first: the table must not be modified while
        // its key collection is being enumerated.
        var pendingLinks = new List<string>();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if (!(bool)crossreferenceLinks[link])
            {
                pendingLinks.Add(link);
            }
        }

        foreach (var link in pendingLinks)
        {
            if (crossreferenceFlag || numberedEntryRegex.IsMatch(link))
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                Console.WriteLine("{0}", link);
                crossreferenceLinks[link] = true;
                reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
            }
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Command-line entry point: reads a word list, runs one dictionary parser over it
/// and saves the produced cards to a time-stamped text file in the current directory.
/// </summary>
/// <param name="args">
/// Recognized arguments: <c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c>
/// (parser selection), <c>-p &lt;path&gt;</c> (word list file, required), <c>-l &lt;labels&gt;</c>,
/// <c>-limit &lt;n&gt;</c>, <c>-vdomain &lt;domain&gt;</c> and <c>-related</c>.
/// </param>
public static void Main(string[] args)
{
    ParserMask useParser = ParserMask.None;
    var inputPath = "";
    var outputPath = String.Empty;
    var wordlist = new List<String>();
    var labels = "";
    var relatedFlag = false;
    var examplesLimit = Int32.MaxValue;
    var vocabularyComDomain = String.Empty;
    CardsStream output;

    // NOTE: when several parser switches are given, the output file name comes from
    // the last one, while only the first match in the dispatch chain below is run.
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i] == "-oald8")
        {
            outputPath = "oald8";
            useParser |= ParserMask.Oald8;
        }
        else if (args[i] == "-macmillan")
        {
            outputPath = "macmillan";
            useParser |= ParserMask.Macmillan;
        }
        else if (args[i] == "-vocabcom")
        {
            outputPath = "vocabulary.com";
            useParser |= ParserMask.VocabularyCom;
        }
        else if (args[i] == "-lingvo")
        {
            outputPath = "lingvo";
            useParser |= ParserMask.LingvoRu;
        }
        else if (args[i] == "-l" && i + 1 < args.Length)
        {
            labels = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-limit" && i + 1 < args.Length)
        {
            examplesLimit = Convert.ToInt32((args[i + 1]).Trim());
            i++;
        }
        else if (args[i] == "-vdomain" && i + 1 < args.Length)
        {
            vocabularyComDomain = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-p" && i + 1 < args.Length)
        {
            inputPath = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-related")
        {
            relatedFlag = true;
        }
    }

    if (inputPath == "")
    {
        Console.Write("Where's a listname, uh? \nYou should enter -p <path_to_you_wordlist>");
        Console.ReadKey();
        return;
    }

    try
    {
        // Everything except letters, digits, space, '-' and apostrophe is stripped from a line.
        var disallowedChars = new Regex("[^- 0-9a-zA-Z']+");
        // Tracks words already added so duplicates are dropped without rescanning the list.
        var seenWords = new HashSet<String>();

        // The using block closes the reader even if reading throws.
        using (var input = new StreamReader(inputPath))
        {
            string line;
            while ((line = input.ReadLine()) != null)
            {
                var word = disallowedChars.Replace(line, "").Trim();
                // Blank lines (or lines with no allowed characters) are not words.
                if (word.Length != 0 && seenWords.Add(word))
                {
                    wordlist.Add(word);
                }
            }
        }
    }
    catch (FileNotFoundException)
    {
        Console.Write("Wrong path: {0}", inputPath);
        // Without a word list there is nothing to process.
        return;
    }
    catch (DirectoryNotFoundException)
    {
        Console.Write("Wrong directory: {0}", inputPath);
        return;
    }

    if (labels.Length != 0)
    {
        outputPath = "./" + outputPath + " " + labels;
    }
    else
    {
        outputPath = "./" + outputPath;
    }
    outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";
    output = new CardsStream(outputPath, 100000);

    if ((useParser & ParserMask.Oald8) != ParserMask.None)
    {
        var parser = new Oald8();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
    {
        var parser = new Macmillan();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
    {
        var parser = new VocabularyCom();
        parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
    {
        var parser = new LingvoRu();
        parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }

    if (useParser != ParserMask.None)
    {
        output.Save();
    }
}
/// <summary>
/// Command-line entry point: reads a word list, runs one dictionary parser over it
/// and saves the produced cards to a time-stamped text file in the current directory.
/// </summary>
/// <param name="args">
/// Recognized arguments: <c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c>
/// (parser selection), <c>-p &lt;path&gt;</c> (word list file, required), <c>-l &lt;labels&gt;</c>,
/// <c>-limit &lt;n&gt;</c>, <c>-vdomain &lt;domain&gt;</c> and <c>-related</c>.
/// </param>
public static void Main(string[] args)
{
    ParserMask useParser = ParserMask.None;
    var inputPath = "";
    var outputPath = String.Empty;
    var wordlist = new List<String>();
    var labels = "";
    var relatedFlag = false;
    var examplesLimit = Int32.MaxValue;
    var vocabularyComDomain = String.Empty;
    CardsStream output;

    // NOTE: when several parser switches are given, the output file name comes from
    // the last one, while only the first match in the dispatch chain below is run.
    for (int i = 0; i < args.Length; i++)
    {
        var hasValue = i + 1 < args.Length;

        if (args[i] == "-oald8")
        {
            outputPath = "oald8";
            useParser |= ParserMask.Oald8;
        }
        else if (args[i] == "-macmillan")
        {
            outputPath = "macmillan";
            useParser |= ParserMask.Macmillan;
        }
        else if (args[i] == "-vocabcom")
        {
            outputPath = "vocabulary.com";
            useParser |= ParserMask.VocabularyCom;
        }
        else if (args[i] == "-lingvo")
        {
            outputPath = "lingvo";
            useParser |= ParserMask.LingvoRu;
        }
        else if (args[i] == "-l" && hasValue)
        {
            labels = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-limit" && hasValue)
        {
            examplesLimit = Convert.ToInt32((args[i + 1]).Trim());
            i++;
        }
        else if (args[i] == "-vdomain" && hasValue)
        {
            vocabularyComDomain = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-p" && hasValue)
        {
            inputPath = (args[i + 1]).Trim();
            i++;
        }
        else if (args[i] == "-related")
        {
            relatedFlag = true;
        }
    }

    if (inputPath == "")
    {
        Console.Write("Where's a listname, uh? \nYou should enter -p <path_to_you_wordlist>");
        Console.ReadKey();
        return;
    }

    try
    {
        // Everything except letters, digits, space, '-' and apostrophe is stripped from a line.
        var disallowedChars = new Regex("[^- 0-9a-zA-Z']+");
        // Tracks words already added so duplicates are dropped without rescanning the list.
        var seenWords = new HashSet<String>();

        // The using block closes the reader even if reading throws.
        using (var input = new StreamReader(inputPath))
        {
            string line;
            while ((line = input.ReadLine()) != null)
            {
                var word = disallowedChars.Replace(line, "").Trim();
                // Blank lines (or lines with no allowed characters) are not words.
                if (word.Length != 0 && seenWords.Add(word))
                {
                    wordlist.Add(word);
                }
            }
        }
    }
    catch (FileNotFoundException)
    {
        Console.Write("Wrong path: {0}", inputPath);
        // Without a word list there is nothing to process.
        return;
    }
    catch (DirectoryNotFoundException)
    {
        Console.Write("Wrong directory: {0}", inputPath);
        return;
    }

    outputPath = (labels.Length != 0)
        ? "./" + outputPath + " " + labels
        : "./" + outputPath;
    outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";
    output = new CardsStream(outputPath, 100000);

    if ((useParser & ParserMask.Oald8) != ParserMask.None)
    {
        var parser = new Oald8();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
    {
        var parser = new Macmillan();
        parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
    {
        var parser = new VocabularyCom();
        parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }
    else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
    {
        var parser = new LingvoRu();
        parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
        Console.WriteLine("\nCount: {0}\n", parser.count);
    }

    if (useParser != ParserMask.None)
    {
        output.Save();
    }
}
/// <summary>
/// For every word in <paramref name="wordlist"/>: loads its definitions page, collects
/// the short definitions, then downloads usage examples page by page and writes one
/// tab-separated card line per example sentence to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Destination for the generated card lines.</param>
/// <param name="wordlist">Words to look up.</param>
/// <param name="userLabels">User labels; "vocabulary_com " is prepended to them.</param>
/// <param name="domain">Value sent as the <c>domain</c> query parameter of the examples request.</param>
/// <param name="limit">Maximum number of examples requested per word.</param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
{
    // Largest number of examples requested in one call.
    const int MaxPageSize = 48;
    // A page that keeps failing is abandoned after this many consecutive errors
    // instead of being retried forever.
    const int MaxAttemptsPerPage = 3;
    // Placeholder for field boundaries; swapped for a real tab only after tabs and
    // line breaks coming from the content have been stripped.
    const string FieldSeparator = "%%!!%%";

    var quickDefIdRegex = new Regex(@"quickDef(\d+)");
    var lineBreaksAndTabsRegex = new Regex("[\t\r\n]");
    var leadingWordRegex = new Regex(@"^\w+\s+(.*?)$");
    // The response contains $d(...) wrappers; they are turned into quoted strings
    // before the text is handed to the JSON parser.
    var dateWrapperRegex = new Regex(@"\$d\((.*?)\)");

    var pageSize = (limit < MaxPageSize) ? limit : MaxPageSize;
    userLabels = "vocabulary_com " + userLabels;

    foreach (String word in wordlist)
    {
        var wordLabel = word.Replace(' ', '-');
        var primaryDefinitions = String.Empty;
        var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
        Console.WriteLine("{0}", word);

        // Primary definitions: the quick-definition blocks, unselected first, then selected.
        foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
        {
            var quickDefinitions = page.SelectNodes(xpath);
            if (quickDefinitions == null)
            {
                continue;
            }
            foreach (var definition in quickDefinitions)
            {
                // The id "quickDefN" of the block corresponds to the anchor "#sN".
                var href = "#s" + quickDefIdRegex.Replace(definition.Attributes["id"].Value, "$1");
                primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</b></i> " + definition.InnerText + "<br/>";
            }
        }

        // Fallback when no quick definitions exist: use the definition headings.
        if (primaryDefinitions.Length == 0)
        {
            var headings = page.SelectNodes("//h3[@class='definition']");
            if (headings != null)
            {
                foreach (var heading in headings)
                {
                    var partOfSpeech = heading.SelectSingleNode("a");
                    var currentDefinition = lineBreaksAndTabsRegex.Replace(getText(heading), "");
                    // Drop the first word of the heading text, keeping the rest.
                    currentDefinition = leadingWordRegex.Replace(currentDefinition, "$1");
                    primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</b></i> " + currentDefinition + "<br/>";
                }
                // Remove the trailing "<br/>".
                if (primaryDefinitions.Length >= 5)
                {
                    primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
                }
            }
        }

        var offset = 0;
        // Unknown until the first page has been read.
        var totalHits = Int32.MaxValue;
        var failedAttempts = 0;

        while (offset < limit && offset < totalHits)
        {
            var step = (limit - offset < pageSize) ? limit - offset : pageSize;

            try
            {
                // Decode as UTF-8 so non-ASCII characters in the sentences survive.
                var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(examplesPath + word + "&maxResults=" + step
                    + "&startOffset=" + offset + "&filter=0&domain=" + domain));
                rawJson = dateWrapperRegex.Replace(rawJson, @"""$1""");
                var json = JObject.Parse(rawJson);

                if (offset == 0)
                {
                    var hits = (int)json.SelectToken("result.totalHits");
                    totalHits = hits;
                    reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                    Console.WriteLine("Hits: {0}", hits);
                    if (hits == 0)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                    }
                }

                Console.WriteLine("Processed: {0}", offset);
                offset += step;
                failedAttempts = 0;

                var sentences = json.SelectToken("result.sentences");
                if (sentences == null)
                {
                    break;
                }

                var sentencesOnPage = 0;
                foreach (var sentence in sentences)
                {
                    sentencesOnPage++;
                    count_++;
                    var example = new StringBuilder()
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append(FieldSeparator)
                        .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                        .Append(FieldSeparator)
                        .Append(word)
                        .Append(FieldSeparator)
                        .Append(primaryDefinitions)
                        .Append(FieldSeparator)
                        .Append(wordLabel)
                        .Append(" ")
                        .Append(userLabels);
                    stream.Write(lineBreaksAndTabsRegex.Replace(example.ToString(), "").Replace(FieldSeparator, "\t") + "\n");
                }

                // An empty page means there is nothing further to fetch.
                if (sentencesOnPage == 0)
                {
                    break;
                }
            }
            catch (Exception ex)
            {
                // Best effort: retry (or move on if the offset was already advanced),
                // but never loop forever on a request that keeps failing.
                failedAttempts++;
                if (failedAttempts >= MaxAttemptsPerPage)
                {
                    reportStream.Write("Failure. Examples could not be loaded. Word: " + word + " Offset: " + offset + " Error: " + ex.Message + "\n");
                    break;
                }
            }
        }

        Console.WriteLine("Processed: {0}\n", offset);
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}
/// <summary>
/// Loads the search page of every word in <paramref name="wordlist"/>, collects the
/// links of its "relatedentries" block and parses the relevant pages, writing the
/// results to <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Destination passed on to <c>ParsePage</c>.</param>
/// <param name="wordlist">Words to look up.</param>
/// <param name="labels">Labels passed on to <c>ParsePage</c>.</param>
/// <param name="crossreferenceFlag">
/// When true every collected link is parsed; otherwise only the word's own page and
/// links of the form "&lt;word&gt;_&lt;number&gt;" (spaces in the word replaced by '-') are parsed.
/// </param>
public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
{
    // Parse the source pages. This could possibly be sped up by parallelizing.
    // Collect the links in the article body and add them to the list.
    //downloadPages(ref stream, GetArticleCrossrefenceLinkList(), labels);

    // Work with the links found in the search-results block, including the link to the word itself.
    // Maps link -> "already parsed" flag; shared by all words so a link is parsed at most once.
    var crossreferenceLinks = new Hashtable();

    foreach (String word in wordlist)
    {
        var page = (new HtmlWeb()).Load(searchPath + word);

        // Determine which link the page of the current word lives at.
        var currentPageNode = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
        string currentPageLink = (currentPageNode != null) ? getCleanUrl(currentPageNode) : "";

        // Take all words from the search-results block.
        var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
        if (linksNodes != null)
        {
            foreach (HtmlNode link in linksNodes)
            {
                var url = getCleanUrl(link);
                if (!crossreferenceLinks.ContainsKey(url))
                {
                    crossreferenceLinks.Add(url, false);
                }
            }
        }
        else
        {
            reportStream.Write("Failure. Page not found. Link: " + word + "\n");
        }

        // Everything needed has been collected; start processing the links.
        // The pattern depends only on the current word, so build it once per word.
        // The word is escaped so regex metacharacters in it are matched literally.
        var numberedEntryRegex = new Regex(
            "^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$",
            RegexOptions.IgnoreCase);

        // Snapshot the unparsed links first: the table must not be modified while
        // its key collection is being enumerated.
        var pendingLinks = new List<string>();
        foreach (string link in crossreferenceLinks.Keys)
        {
            if (!(bool)crossreferenceLinks[link])
            {
                pendingLinks.Add(link);
            }
        }

        foreach (var link in pendingLinks)
        {
            if (currentPageLink == link)
            {
                // The first page has already been downloaded; reuse it.
                ParsePage(ref stream, page, word, labels);
            }
            else if (crossreferenceFlag || numberedEntryRegex.IsMatch(link))
            {
                ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
            }
            else
            {
                // Not relevant for this word; stays unparsed for later words.
                continue;
            }

            Console.WriteLine("{0}", link);
            crossreferenceLinks[link] = true;
            reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
        }
    }

    reportStream.Write("Total: " + count_ + "\n");
    reportStream.Save();
}