Пример #1
0
        /// <summary>
        /// Parses the page of every word in <paramref name="wordlist"/> and then the pages linked from the
        /// "entrylist" search-results blocks collected so far, writing the resulting cards to
        /// <paramref name="stream"/> and progress messages to the report stream.
        /// </summary>
        /// <param name="stream">Card output stream, forwarded to <c>ParsePage</c>.</param>
        /// <param name="wordlist">Words to look up; each one is appended to <c>searchPath</c> to form the page address.</param>
        /// <param name="labels">User labels forwarded to <c>ParsePage</c>.</param>
        /// <param name="crossreferenceFlag">
        /// When true, every collected link that has not been parsed yet is parsed; when false, only links of
        /// the form <c>&lt;word&gt;_&lt;number&gt;</c> for the current word are parsed.
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
        {
            // NOTE: pages are processed one after another; this could probably be sped up by parallelising.

            // Links found in the search-results blocks (including links to the word itself).
            // Key: cleaned URL, value: true once that link has been parsed.
            var crossreferenceLinks = new Hashtable();

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                ParsePage(ref stream, page, word, labels);
                Console.WriteLine("{0}", word);
                reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

                // Collect every link listed in the search-results block of this page.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode link in linksNodes)
                    {
                        var url = getCleanUrl(link);
                        if (!crossreferenceLinks.ContainsKey(url))
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }

                // Matches "<word>_<number>" links. The pattern depends only on the word, so it is built once
                // per word; Regex.Escape keeps metacharacters in the word from changing the pattern.
                var tranformedWordRegExp = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

                // A Hashtable must not be modified while it is being enumerated, so the links parsed in
                // this pass are remembered here and flagged once the enumeration is over.
                var parsedLinks = new List<string>();

                foreach (string link in crossreferenceLinks.Keys)
                {
                    if ((bool)crossreferenceLinks[link])
                    {
                        continue;
                    }

                    if (crossreferenceFlag || tranformedWordRegExp.Match(link).Success)
                    {
                        ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);

                        Console.WriteLine("{0}", link);
                        parsedLinks.Add(link);
                        reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                    }
                }

                foreach (var parsedLink in parsedLinks)
                {
                    crossreferenceLinks[parsedLink] = true;
                }
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
        /// <summary>
        /// Turns every <c>div.EXAMPLES</c> block of a dictionary page into one tab-separated card line
        /// written to <paramref name="stream"/>, and increments <c>count_</c> for each line.
        /// </summary>
        /// <param name="stream">Destination of the generated card lines.</param>
        /// <param name="document">Loaded HTML page to extract the examples from.</param>
        /// <param name="link">Page identifier; used only in the report messages.</param>
        /// <param name="userLabels">Extra labels placed after the automatic "macmillan &lt;word&gt;" labels; may be empty.</param>
        void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
        {
            var root = document.DocumentNode;

            var exampleBlocks = root.SelectNodes("//div[@class='EXAMPLES']");
            if (exampleBlocks == null)
            {
                reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
                return;
            }

            var pronunciationNode = root.SelectSingleNode("//span[@class='PRON']");
            var word = getText(root.SelectSingleNode("//span[@class='BASE']"));

            // Labels attached to every card of this page: the dictionary name, the word with spaces
            // turned into dashes, then whatever the caller supplied.
            var cardLabels = "macmillan " + word.Replace(' ', '-');
            if (userLabels != "")
            {
                cardLabels += " " + userLabels;
            }

            foreach (HtmlNode exampleBlock in exampleBlocks)
            {
                var senseNode = exampleBlock.ParentNode;

                // The structure is the block's own <strong>; when there is none, fall back to the BASE
                // span found three levels up (DIV#SENSE_BODY -> DIV -> LI).
                var structureNode = exampleBlock.SelectSingleNode("strong")
                                    ?? senseNode.ParentNode.ParentNode.SelectSingleNode("div/h2/span[@class='BASE']");

                var card = new Card
                {
                    Sentence         = getText(exampleBlock.SelectSingleNode("p[@id='EXAMPLE']")),
                    Definition       = getText(senseNode.SelectSingleNode("span[@class='DEFINITION']")),
                    GbrTranscription = getText(pronunciationNode).Replace("/", ""),
                    SimpleStructure  = getText(structureNode).Replace(":", "")
                };

                if (card.Definition == "")
                {
                    reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
                }
                if (card.Sentence == "")
                {
                    reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
                }

                var fields = new string[]
                {
                    card.Sentence,
                    card.Interpretation,
                    word,
                    card.GbrTranscription,
                    card.UsaTranscription,
                    card.SimpleStructure,
                    card.Definition,
                    cardLabels
                };

                stream.Write(string.Join("\t", fields) + "\n");
                count_++;
            }
        }
        /// <summary>
        /// Reads the examples of one dictionary page and emits a card line for each of them.
        /// </summary>
        /// <remarks>
        /// Each line holds, separated by tabs: sentence, interpretation, word, British transcription,
        /// American transcription, structure, definition and labels. Missing definitions or sentences are
        /// noted in the report stream but the line is still written.
        /// </remarks>
        /// <param name="stream">Stream receiving the card lines.</param>
        /// <param name="document">HTML page that was loaded for <paramref name="link"/>.</param>
        /// <param name="link">Name of the page as it should appear in report messages.</param>
        /// <param name="userLabels">Caller-supplied labels, appended after "macmillan &lt;word&gt;"; may be empty.</param>
        void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
        {
            var exampleNodes = document.DocumentNode.SelectNodes("//div[@class='EXAMPLES']");
            if (exampleNodes == null)
            {
                reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
                return;
            }

            var transcriptionNode = document.DocumentNode.SelectSingleNode("//span[@class='PRON']");
            var headword = getText(document.DocumentNode.SelectSingleNode("//span[@class='BASE']"));

            // Prefix the caller's labels with the dictionary name and the dash-joined headword.
            var automaticLabels = "macmillan " + headword.Replace(' ', '-');
            if (userLabels == "")
            {
                userLabels = automaticLabels;
            }
            else
            {
                userLabels = automaticLabels + " " + userLabels;
            }

            foreach (HtmlNode exampleNode in exampleNodes)
            {
                var card = new Card();
                var container = exampleNode.ParentNode;

                // Prefer the <strong> inside the example block; otherwise use the BASE span located
                // three ancestors up (DIV#SENSE_BODY -> DIV -> LI).
                var structureNode = exampleNode.SelectSingleNode("strong");
                if (structureNode == null)
                {
                    var entryNode = container.ParentNode.ParentNode;
                    structureNode = entryNode.SelectSingleNode("div/h2/span[@class='BASE']");
                }

                card.Sentence = getText(exampleNode.SelectSingleNode("p[@id='EXAMPLE']"));
                card.Definition = getText(container.SelectSingleNode("span[@class='DEFINITION']"));
                card.GbrTranscription = getText(transcriptionNode).Replace("/", "");
                card.SimpleStructure = getText(structureNode).Replace(":", "");

                var definitionMissing = card.Definition == "";
                if (definitionMissing)
                {
                    reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
                }

                var sentenceMissing = card.Sentence == "";
                if (sentenceMissing)
                {
                    reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
                }

                var line = string.Format(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n",
                    card.Sentence,
                    card.Interpretation,
                    headword,
                    card.GbrTranscription,
                    card.UsaTranscription,
                    card.SimpleStructure,
                    card.Definition,
                    userLabels);

                stream.Write(line);
                count_++;
            }
        }
Пример #4
0
        /// <summary>
        /// Downloads the example pages of every word in <paramref name="wordlist"/>, writes the parsed
        /// examples to <paramref name="stream"/> and records the outcome per word in the report stream.
        /// </summary>
        /// <param name="stream">Card output stream.</param>
        /// <param name="wordlist">Words whose examples are requested through <c>DownloadExamples</c>.</param>
        /// <param name="labels">User labels; every card is labelled "lingvo [labels ]word".</param>
        /// <param name="limit">Paging stops once the request offset (advanced by 5 per page) reaches this value.</param>
        public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, int limit)
        {
            // Label prefix shared by all cards; the word itself is appended when a card is written.
            labels = (labels == "") ? "lingvo " : "lingvo " + labels + " ";

            foreach (var word in wordlist)
            {
                Console.WriteLine("{0}", word);

                for (var offset = 0; offset < limit; offset += 5)
                {
                    var page = new HtmlDocument();
                    page.LoadHtml(DownloadExamples(word, offset));

                    var examples = page.DocumentNode.SelectNodes("//tr[@class='item first']");
                    if (examples == null)
                    {
                        // Only an empty FIRST page means the word has no examples; an empty later page
                        // just marks the end of the results and must not be reported as a failure.
                        if (offset == 0)
                        {
                            reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                        }
                        break;
                    }

                    stream.Write(Parse(examples, labels + word));
                    count_ += examples.Count;

                    // Success is reported once per word, and only after something was actually parsed.
                    if (offset == 0)
                    {
                        reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
                    }

                    examples = page.DocumentNode.SelectNodes("//tr[@class='item']");
                    if (examples == null)
                    {
                        break;
                    }

                    stream.Write(Parse(examples, labels + word));
                    count_ += examples.Count;
                }
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Пример #5
0
        /// <summary>
        /// Fetches and parses example pages for each word, page by page, until the results run out or
        /// the request offset reaches <paramref name="limit"/>.
        /// </summary>
        /// <remarks>
        /// The report stream receives one "Success" line for a word whose first page contained examples
        /// and one "Failure" line for a word whose first page contained none, followed by a final total.
        /// </remarks>
        /// <param name="stream">Stream the parsed examples are written to.</param>
        /// <param name="wordlist">Words to process.</param>
        /// <param name="labels">User labels inserted between the "lingvo" label and the word; may be empty.</param>
        /// <param name="limit">Upper bound for the request offset, which grows by 5 after each page.</param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, int limit)
        {
            labels = (labels == "") ? "lingvo " : "lingvo " + labels + " ";

            foreach (var word in wordlist)
            {
                var offset = 0;
                var anyExamplesFound = false;
                var cardLabels = labels + word;

                Console.WriteLine("{0}", word);

                while (offset < limit)
                {
                    var page = new HtmlDocument();
                    page.LoadHtml(DownloadExamples(word, offset));

                    var leadingRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
                    if (leadingRows == null)
                    {
                        // No rows at all: either the word has no examples or the previous page was the last one.
                        break;
                    }

                    anyExamplesFound = true;
                    stream.Write(Parse(leadingRows, cardLabels));
                    count_ += leadingRows.Count;

                    var remainingRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
                    if (remainingRows == null)
                    {
                        break;
                    }

                    stream.Write(Parse(remainingRows, cardLabels));
                    count_ += remainingRows.Count;

                    offset += 5;
                }

                // Report the outcome after the fact. Previously "Success" was written before anything was
                // downloaded and "Failure" was written whenever paging ran past the last page, so a word
                // with examples ended up with both lines.
                if (anyExamplesFound)
                {
                    reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
                }
                else if (limit > 0)
                {
                    reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                }
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
        /// <summary>
        /// Converts every <c>span.x-g</c> example of a dictionary page into one tab-separated card line
        /// written to <paramref name="stream"/>, incrementing <c>count_</c> per line.
        /// </summary>
        /// <param name="stream">Destination of the generated card lines.</param>
        /// <param name="document">Loaded HTML page.</param>
        /// <param name="link">Page identifier used in report messages only.</param>
        /// <param name="userLabels">Extra labels placed after the automatic "oald8 &lt;word&gt;" labels; may be empty.</param>
        void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
        {
            var root = document.DocumentNode;

            var exampleNodes = root.SelectNodes("//span[@class='x-g']");
            if (exampleNodes == null)
            {
                reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
                return;
            }

            // Page-wide data: both transcriptions and the headword.
            var usaTranscription = root.SelectSingleNode("//span[@class='y']");
            var gbrTranscription = root.SelectSingleNode("//span[@class='i']");
            var word = getText(root.SelectSingleNode("//h2[@class='h']"));

            var cardLabels = "oald8 " + word.Replace(' ', '-');
            if (userLabels != "")
            {
                cardLabels += " " + userLabels;
            }

            // Strips " (= ...)" glosses from an example to obtain the bare sentence.
            var glossPattern = new Regex(" \\(=.*?\\)");

            foreach (HtmlNode exampleNode in exampleNodes)
            {
                var card = new Card();
                var owner = exampleNode.ParentNode;
                var ownerName = GetName(owner);

                // Structure parts, outermost first; which nodes contribute depends on the kind of
                // element that owns the example. The example's own "cf" span always comes last.
                var ownStructure = exampleNode.SelectSingleNode("span[@class='cf']");
                HtmlNode[] structureNodes;

                if (ownerName == "pv-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        owner.SelectSingleNode("h4[@class='pv']"),
                        ownStructure
                    };
                }
                else if (ownerName == "n-g" && GetName(owner.ParentNode) == "pv-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        owner.ParentNode.SelectSingleNode("h4[@class='pv']"),
                        owner.SelectSingleNode("span[@class='vs-g']"),
                        ownStructure
                    };
                }
                else if (ownerName == "n-g" || ownerName == "h-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        owner.SelectSingleNode("span[@class='cf']"),
                        ownStructure
                    };
                }
                else if (ownerName == "id-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        owner.SelectSingleNode("h4[@class='id']"),
                        ownStructure
                    };
                }
                else
                {
                    structureNodes = new HtmlNode[0];
                }

                foreach (var structureNode in structureNodes)
                {
                    if (structureNode != null)
                    {
                        card.Structure.Add(structureNode.InnerText);
                    }
                }

                // Definition: for "id-g"/"h-g" owners look inside their def_block when there is one,
                // otherwise directly under the owner; prefer the "ud" span over the "d" span.
                HtmlNode definitionScope = null;
                if (ownerName == "id-g" || ownerName == "h-g")
                {
                    definitionScope = owner.SelectSingleNode("div[@class='def_block']");
                }
                definitionScope = definitionScope ?? owner;

                var definitionNode = definitionScope.SelectSingleNode("span[@class='ud']")
                                     ?? definitionScope.SelectSingleNode("span[@class='d']");

                // The example itself: the raw text is the interpretation, the text without glosses is the sentence.
                card.Interpretation = getText(exampleNode.SelectSingleNode("span[@class='x']"));
                card.Sentence = glossPattern.Replace(card.Interpretation, "");

                card.Definition = getText(definitionNode);
                card.Definition = card.Definition.Replace("    ", " ");

                card.UsaTranscription = getText(usaTranscription);
                card.GbrTranscription = getText(gbrTranscription);

                // An interpretation identical to the sentence carries no extra information.
                if (card.Interpretation == card.Sentence)
                {
                    card.Interpretation = "";
                }

                if (card.Definition == "")
                {
                    reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
                }
                if (card.Sentence == "")
                {
                    reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
                }

                var line = string.Format(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n",
                    card.Sentence,
                    card.Interpretation,
                    word,
                    card.GbrTranscription,
                    card.UsaTranscription,
                    PrintList(card.Structure),
                    card.Definition,
                    cardLabels);

                stream.Write(line);
                count_++;
            }
        }
Пример #7
0
        /// <summary>
        /// For every word: loads its definitions page, builds an HTML summary of the definitions, then
        /// downloads example sentences page by page and writes one tab-separated card line per sentence.
        /// </summary>
        /// <remarks>
        /// Paging for a word stops when <paramref name="limit"/> sentences have been requested, when the
        /// service reports zero hits or returns a page without sentences, or after three consecutive
        /// failed requests.
        /// </remarks>
        /// <param name="stream">Card output stream.</param>
        /// <param name="wordlist">Words to process.</param>
        /// <param name="userLabels">User labels; "vocabulary_com " is prepended and the dash-joined word precedes them on every card.</param>
        /// <param name="domain">Value sent as the <c>domain</c> query parameter of the examples request.</param>
        /// <param name="limit">Maximum number of sentences requested per word.</param>
        public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string userLabels, string domain, int limit)
        {
            const int PageSize = 48;
            const int MaxConsecutiveFailures = 3;
            const string FieldSeparator = "%%!!%%";

            // Loop-invariant patterns, built once.
            var quickDefId = new Regex(@"quickDef(\d+)");
            var controlWhitespace = new Regex("[\t\r\n]");
            var leadingWord = new Regex(@"^\w+\s+(.*?)$");
            var dollarDCall = new Regex(@"\$d\((.*?)\)");

            var maxStep = (limit < PageSize) ? limit : PageSize;
            userLabels = "vocabulary_com " + userLabels;

            foreach (String word in wordlist)
            {
                var wordLabel = word.Replace(' ', '-');
                var primaryDefinitions = String.Empty;
                var page = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
                Console.WriteLine("{0}", word);

                // Quick definitions: the regular ones first, then the selected one(s).
                foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
                {
                    var quickDefinitions = page.SelectNodes(xpath);
                    if (quickDefinitions == null)
                    {
                        continue;
                    }

                    foreach (var quickDefinition in quickDefinitions)
                    {
                        // "quickDefN" -> "#sN"; the text of the anchor pointing there prefixes the definition.
                        var href = "#s" + quickDefId.Replace(quickDefinition.Attributes["id"].Value, "$1");
                        primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</b></i> " + quickDefinition.InnerText + "<br/>";
                    }
                }

                // Fallback when the page has no quick definitions: use the h3.definition headings.
                if (primaryDefinitions.Length == 0)
                {
                    var headings = page.SelectNodes("//h3[@class='definition']");
                    if (headings != null)
                    {
                        foreach (var heading in headings)
                        {
                            var partOfSpeech = heading.SelectSingleNode("a");
                            var currentDefinition = controlWhitespace.Replace(getText(heading), "");
                            currentDefinition = leadingWord.Replace(currentDefinition, "$1");
                            primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</b></i> " + currentDefinition + "<br/>";
                        }

                        // Drop the trailing "<br/>".
                        primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
                    }
                }

                var offset = 0;
                var consecutiveFailures = 0;

                while (offset < limit)
                {
                    var remaining = limit - offset;
                    var step = (remaining < maxStep) ? remaining : maxStep;

                    try
                    {
                        var url = examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset + "&filter=0&domain=" + domain;

                        // Decode as UTF-8: the previous "UTF8Encoding.ASCII" resolved to the ASCII
                        // encoding and replaced every non-ASCII character with '?'.
                        var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(url));

                        // Turn $d(...) wrappers into quoted strings so that the payload parses as JSON.
                        rawJson = dollarDCall.Replace(rawJson, @"""$1""");
                        var json = JObject.Parse(rawJson);

                        if (offset == 0)
                        {
                            var hits = (int)json.SelectToken("result.totalHits");

                            reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                            Console.WriteLine("Hits: {0}", hits);

                            if (hits == 0)
                            {
                                reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                                break;
                            }
                        }

                        Console.WriteLine("Processed: {0}", offset);

                        offset += step;

                        var written = 0;
                        var sentences = json.SelectToken("result.sentences");
                        if (sentences != null)
                        {
                            foreach (var item in sentences)
                            {
                                count_++;
                                written++;

                                var sentence = SafeTrim((string)item.SelectToken("sentence"));

                                // The sentence fills the first two fields of the card line.
                                var example = new StringBuilder()
                                    .Append(sentence)
                                    .Append(FieldSeparator)
                                    .Append(sentence)
                                    .Append(FieldSeparator)
                                    .Append(word)
                                    .Append(FieldSeparator)
                                    .Append(primaryDefinitions)
                                    .Append(FieldSeparator)
                                    .Append(wordLabel)
                                    .Append(" ")
                                    .Append(userLabels);

                                // Strip tabs/newlines from the content first, then turn the placeholders into real tabs.
                                stream.Write(controlWhitespace.Replace(example.ToString(), "").Replace(FieldSeparator, "\t") + "\n");
                            }
                        }

                        consecutiveFailures = 0;

                        // An empty page means the results are exhausted; without this check the loop
                        // kept requesting pages until the offset reached the limit.
                        if (written == 0)
                        {
                            break;
                        }
                    }
                    catch (Exception e)
                    {
                        // A failed request is retried (the offset only advances after a successful
                        // download), but not forever: a persistent error used to loop endlessly.
                        consecutiveFailures++;
                        if (consecutiveFailures >= MaxConsecutiveFailures)
                        {
                            reportStream.Write("Failure. Examples could not be loaded. Word: " + word + " Error: " + e.Message + "\n");
                            break;
                        }
                    }
                }

                Console.WriteLine("Processed: {0}\n", offset);
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Пример #8
0
        /// <summary>
        /// Processes a word list: each word's page is parsed, the links of its search-results block are
        /// added to a shared table, and the not-yet-parsed links that qualify are parsed as well.
        /// </summary>
        /// <param name="stream">Card output stream passed on to <c>ParsePage</c>.</param>
        /// <param name="wordlist">Words to look up under <c>searchPath</c>.</param>
        /// <param name="labels">User labels passed on to <c>ParsePage</c>.</param>
        /// <param name="crossreferenceFlag">
        /// True to follow every collected link; false to follow only links named
        /// <c>&lt;word&gt;_&lt;number&gt;</c> for the word currently being processed.
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, bool crossreferenceFlag)
        {
            // NOTE: the pages are fetched sequentially; parallelising this may speed things up.

            // Every link seen in a search-results block (links to the word itself included),
            // mapped to a flag telling whether it has been parsed already.
            var crossreferenceLinks = new Hashtable();

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                ParsePage(ref stream, page, word, labels);
                Console.WriteLine("{0}", word);
                reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

                // Register the links of this page's search-results block.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode linkNode in linksNodes)
                    {
                        var url = getCleanUrl(linkNode);
                        if (!crossreferenceLinks.ContainsKey(url))
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }

                // The pattern is the same for all links of one word, so it is created once here instead
                // of once per link. The word is escaped so that it is always matched literally.
                var escapedWord = Regex.Escape(word.Replace(' ', '-'));
                var tranformedWordRegExp = new Regex("^(" + escapedWord + "_\\d+)$", RegexOptions.IgnoreCase);

                // Take a snapshot of the links to visit first: the table cannot be updated while its
                // keys are being enumerated.
                var pendingLinks = new List<string>();
                foreach (string link in crossreferenceLinks.Keys)
                {
                    var alreadyParsed = (bool)crossreferenceLinks[link];
                    if (!alreadyParsed && (crossreferenceFlag || tranformedWordRegExp.Match(link).Success))
                    {
                        pendingLinks.Add(link);
                    }
                }

                foreach (var link in pendingLinks)
                {
                    ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);

                    Console.WriteLine("{0}", link);
                    crossreferenceLinks[link] = true;
                    reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                }
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Пример #9
0
        /// <summary>
        /// Command-line entry point: reads the options, loads the word list, runs the selected
        /// dictionary parser and saves the generated cards.
        /// </summary>
        /// <remarks>
        /// Options: <c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c> select the parser
        /// (when several are given, the first one in that order is run); <c>-p &lt;path&gt;</c> word list
        /// file (required); <c>-l &lt;labels&gt;</c> user labels; <c>-limit &lt;n&gt;</c> example limit;
        /// <c>-vdomain &lt;domain&gt;</c> vocabulary.com domain; <c>-related</c> follow related links.
        /// An option that needs a value but is the last argument is ignored.
        /// </remarks>
        /// <param name="args">Command-line arguments.</param>
        public static void Main(string[] args)
        {
            ParserMask useParser = ParserMask.None;
            var inputPath = "";
            var outputPath = String.Empty;
            var wordlist = new List <String>();
            var labels = "";
            var relatedFlag = false;
            var examplesLimit = Int32.MaxValue;
            var vocabularyComDomain = String.Empty;

            for (int i = 0; i < args.Length; i++)
            {
                var hasValue = i + 1 < args.Length;

                switch (args[i])
                {
                    case "-oald8":
                        outputPath = "oald8";
                        useParser |= ParserMask.Oald8;
                        break;

                    case "-macmillan":
                        outputPath = "macmillan";
                        useParser |= ParserMask.Macmillan;
                        break;

                    case "-vocabcom":
                        outputPath = "vocabulary.com";
                        useParser |= ParserMask.VocabularyCom;
                        break;

                    case "-lingvo":
                        outputPath = "lingvo";
                        useParser |= ParserMask.LingvoRu;
                        break;

                    case "-l":
                        if (hasValue)
                        {
                            labels = args[++i].Trim();
                        }
                        break;

                    case "-limit":
                        if (hasValue)
                        {
                            // Report a malformed number instead of crashing with a FormatException.
                            var rawLimit = args[++i].Trim();
                            if (!Int32.TryParse(rawLimit, out examplesLimit))
                            {
                                Console.Write("Wrong limit: {0}", rawLimit);
                                return;
                            }
                        }
                        break;

                    case "-vdomain":
                        if (hasValue)
                        {
                            vocabularyComDomain = args[++i].Trim();
                        }
                        break;

                    case "-p":
                        if (hasValue)
                        {
                            inputPath = args[++i].Trim();
                        }
                        break;

                    case "-related":
                        relatedFlag = true;
                        break;
                }
            }

            if (inputPath == "")
            {
                Console.Write("Where's a listname, uh? You should enter -p <path_to_you_wordlist>");
                Console.ReadKey();
                return;
            }

            try
            {
                // Keeps only latin letters, digits, spaces, dashes and apostrophes of each line.
                var unwantedCharacters = new Regex("[^- 0-9a-zA-Z']+");

                // "using" closes the file even when reading fails half-way.
                using (var input = new StreamReader(inputPath))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        var word = unwantedCharacters.Replace(line, "").Trim();

                        // Blank lines (or lines without any accepted character) are not words.
                        if (word.Length == 0)
                        {
                            continue;
                        }

                        if (!wordlist.Contains(word))
                        {
                            wordlist.Add(word);
                        }
                    }
                }
            }
            catch (FileNotFoundException)
            {
                // Without a word list there is nothing to process, so stop here instead of
                // creating an empty output file.
                Console.Write("Wrong path: {0}", inputPath);
                return;
            }
            catch (DirectoryNotFoundException)
            {
                Console.Write("Wrong directory: {0}", inputPath);
                return;
            }

            // Output file name: "./<parser>[ <labels>] <timestamp>.txt"
            outputPath = "./" + outputPath;
            if (labels.Length != 0)
            {
                outputPath += " " + labels;
            }
            outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

            CardsStream output = new CardsStream(outputPath, 100000);

            if ((useParser & ParserMask.Oald8) != ParserMask.None)
            {
                var parser = new Oald8();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
            {
                var parser = new Macmillan();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
            {
                var parser = new VocabularyCom();
                parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
            {
                var parser = new LingvoRu();
                parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }

            if (useParser != ParserMask.None)
            {
                output.Save();
            }
        }
Пример #10
0
        public static void Main(string[] args)
        {
            ParserMask useParser = ParserMask.None;
            var inputPath = "";
            var outputPath = String.Empty;
            var wordlist = new List<String>();
            var labels = "";
            var relatedFlag = false;
            var examplesLimit = Int32.MaxValue;
            var vocabularyComDomain = String.Empty;

            CardsStream output;
            StreamReader input;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i] == "-oald8")
                {
                    outputPath = "oald8";
                    useParser |= ParserMask.Oald8;
                }
                else if (args[i] == "-macmillan")
                {
                    outputPath = "macmillan";
                    useParser |= ParserMask.Macmillan;
                }
                else if (args[i] == "-vocabcom")
                {
                    outputPath = "vocabulary.com";
                    useParser |= ParserMask.VocabularyCom;
                }
                else if (args[i] == "-lingvo")
                {
                    outputPath = "lingvo";
                    useParser |= ParserMask.LingvoRu;
                }

                else if (args[i] == "-l" && i + 1 < args.Length)
                {
                    labels = (args[i + 1]).Trim();
                    i++;
                }
                else if (args[i] == "-limit" && i + 1 < args.Length)
                {
                    examplesLimit = Convert.ToInt32((args[i + 1]).Trim());
                    i++;
                }

                else if (args[i] == "-vdomain" && i + 1 < args.Length)
                {
                    vocabularyComDomain = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-p" && i + 1 < args.Length)
                {
                    inputPath = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-related")
                {
                    relatedFlag = true;
                }
            }

            if (inputPath == "")
            {
                Console.Write("Where's a listname, uh? You should enter -p <path_to_you_wordlist>");
                Console.ReadKey();
                return;
            }
            else
            {
                try
                {
                    input = new StreamReader(inputPath);
                    while (input.EndOfStream == false)
                    {
                        var word = (new Regex("[^- 0-9a-zA-Z']+")).Replace(input.ReadLine(), "").Trim();
                        if (wordlist.Contains(word) == false)
                        {
                            wordlist.Add(word);
                        }
                    }
                    input.Close();
                }
                catch (FileNotFoundException e)
                {
                    Console.Write("Wrong path: {0}", inputPath);
                }
                catch (DirectoryNotFoundException e)
                {
                    Console.Write("Wrong directory: {0}", inputPath);
                }
            }

            if (labels.Length != 0)
            {
                outputPath = "./" + outputPath + " " + labels;
            }
            else
            {
                outputPath = "./" + outputPath;
            }

            outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

            output = new CardsStream(outputPath, 100000);

            if ((useParser & ParserMask.Oald8) != ParserMask.None)
            {
                var parser = new Oald8();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
            {
                var parser = new Macmillan();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
            {
                var parser = new VocabularyCom();
                parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
            {
                var parser = new LingvoRu();
                parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }

            if (useParser != ParserMask.None)
            {
                output.Save();
            }
        }
Пример #11
0
        /// <summary>
        /// For every word in <paramref name="wordlist"/>: loads its definitions page, collects the
        /// primary definitions, then downloads usage examples page by page and writes one
        /// tab-separated card line per example sentence to <paramref name="stream"/>.
        /// A summary of the run is written to the report stream, which is saved at the end.
        /// </summary>
        /// <param name="stream">Output the card lines are written to.</param>
        /// <param name="wordlist">Words to look up.</param>
        /// <param name="userLabels">Labels appended after the "vocabulary_com" label on every card.</param>
        /// <param name="domain">Value sent as the "domain" query parameter of the examples request.</param>
        /// <param name="limit">Maximum number of examples requested per word; values &lt;= 0 request none.</param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string userLabels, string domain, int limit)
        {
            // Largest number of examples asked for in a single request.
            const int maxPageSize = 48;
            // A word is abandoned after this many requests in a row have failed, so a
            // persistent network or parse error can no longer spin the loop forever.
            const int maxConsecutiveFailures = 3;

            // The patterns do not depend on the word, so they are built once.
            var quickDefRegex    = new Regex(@"quickDef(\d+)");
            var lineBreakRegex   = new Regex("[\t\r\n]");
            var leadingWordRegex = new Regex(@"^\w+\s+(.*?)$");
            var dateCallRegex    = new Regex(@"\$d\((.*?)\)");

            // Quick-definition blocks, in the order they are appended to the card.
            var quickDefinitionXPaths = new[] { "//div[@class='def']", "//div[@class='def selected']" };

            userLabels = "vocabulary_com " + userLabels;

            foreach (String word in wordlist)
            {
                var wordLabel          = word.Replace(' ', '-');
                var primaryDefinitions = String.Empty;
                var page               = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
                Console.WriteLine("{0}", word);

                // Primary definitions: each quick definition is prefixed with the text of the
                // anchor whose href is "#s" followed by the number taken from the block's id.
                foreach (var xpath in quickDefinitionXPaths)
                {
                    var quickNodes = page.SelectNodes(xpath);
                    if (quickNodes == null)
                    {
                        continue;
                    }

                    foreach (var p in quickNodes)
                    {
                        var href = "#s" + quickDefRegex.Replace(p.Attributes["id"].Value, "$1");
                        primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</i></b> " + p.InnerText + "<br/>";
                    }
                }

                // Fallback: no quick definitions were found, use the full definition headings.
                if (primaryDefinitions.Length == 0)
                {
                    var headingNodes = page.SelectNodes("//h3[@class='definition']");
                    if (headingNodes != null)
                    {
                        foreach (var f in headingNodes)
                        {
                            var partOfSpeech      = f.SelectSingleNode("a");
                            var currentDefinition = lineBreakRegex.Replace(getText(f), "");
                            // Drop the leading word of the heading text.
                            currentDefinition   = leadingWordRegex.Replace(currentDefinition, "$1");
                            primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</i></b> " + currentDefinition + "<br/>";
                        }

                        // Remove the trailing "<br/>" (5 characters).
                        primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
                    }
                }

                var offset    = 0;
                var failures  = 0;
                // Unknown until the first page has been parsed.
                var totalHits = Int32.MaxValue;

                // Stop at the caller's limit or once every reported hit has been requested.
                while (offset < limit && offset < totalHits)
                {
                    var step = Math.Min(maxPageSize, limit - offset);

                    try
                    {
                        // Decode as UTF-8: an ASCII decoder replaces every non-ASCII character with '?'.
                        var rawJson = System.Text.Encoding.UTF8.GetString(client.DownloadData(examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset + "&filter=0&domain=" + domain));

                        // The response contains $d(...) calls, which are not valid JSON; quote their argument.
                        rawJson = dateCallRegex.Replace(rawJson, @"""$1""");
                        var json = JObject.Parse(rawJson);

                        if (offset == 0)
                        {
                            totalHits = (int)json.SelectToken("result.totalHits");

                            reportStream.Write("Success. Word: " + word + " Hits: " + totalHits + "\n");
                            Console.WriteLine("Hits: {0}", totalHits);

                            if (totalHits == 0)
                            {
                                reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                            }
                        }

                        Console.WriteLine("Processed: {0}", offset);

                        // Advance before writing, so a failure in the middle of a page moves on
                        // to the next page instead of writing the same sentences twice.
                        offset += step;

                        var received  = 0;
                        var sentences = json.SelectToken("result.sentences");
                        if (sentences != null)
                        {
                            foreach (var sentenceToken in sentences)
                            {
                                count_++;
                                received++;

                                var sentence = SafeTrim((string)sentenceToken.SelectToken("sentence"));

                                // "%%!!%%" is a temporary column separator: tabs and line breaks are
                                // first stripped from the content, then the separators become tabs.
                                // The sentence fills the first two columns.
                                var example = new StringBuilder()
                                              .Append(sentence)
                                              .Append("%%!!%%")
                                              .Append(sentence)
                                              .Append("%%!!%%")
                                              .Append(word)
                                              .Append("%%!!%%")
                                              .Append(primaryDefinitions)
                                              .Append("%%!!%%")
                                              .Append(wordLabel)
                                              .Append(" ")
                                              .Append(userLabels);

                                stream.Write(lineBreakRegex.Replace(example.ToString(), "").Replace("%%!!%%", "\t") + "\n");
                            }
                        }

                        failures = 0;

                        // An empty page means the server has no more examples for this word.
                        if (received == 0)
                        {
                            break;
                        }
                    }
                    catch (Exception e)
                    {
                        failures++;
                        if (failures >= maxConsecutiveFailures)
                        {
                            reportStream.Write("Failure. Examples could not be processed. Word: " + word + " Error: " + e.Message + "\n");
                            break;
                        }
                    }
                }

                Console.WriteLine("Processed: {0}\n", offset);
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Пример #12
0
        /// <summary>
        /// Turns every example (span of class 'x-g') found in <paramref name="document"/> into one
        /// tab-separated card line written to <paramref name="stream"/>. Missing examples,
        /// definitions or sentences are noted in the report stream.
        /// </summary>
        /// <param name="stream">Output the card lines are written to.</param>
        /// <param name="document">Already loaded dictionary page.</param>
        /// <param name="link">Identifier of the page, used only in report messages.</param>
        /// <param name="userLabels">Extra labels appended after the "oald8" and word labels.</param>
        void ParsePage(ref CardsStream stream, HtmlDocument document, string link, string userLabels)
        {
            var root         = document.DocumentNode;
            var exampleNodes = root.SelectNodes("//span[@class='x-g']");

            if (exampleNodes == null)
            {
                reportStream.Write("Failure. Examples was not found. Link: " + link + "\n");
                return;
            }

            // Page-wide data shared by all cards: transcriptions and the headword.
            var usaNode  = root.SelectSingleNode("//span[@class='y']");
            var gbrNode  = root.SelectSingleNode("//span[@class='i']");
            var headword = getText(root.SelectSingleNode("//h2[@class='h']"));

            // Labels: "oald8", the headword with spaces replaced by dashes, then the user's labels.
            var cardLabels = "oald8 " + headword.Replace(' ', '-');
            if (userLabels != "")
            {
                cardLabels += " " + userLabels;
            }

            // Strips a parenthesised " (=...)" explanation from an example.
            var explanationRegex = new Regex(" \\(=.*?\\)");

            foreach (HtmlNode exampleNode in exampleNodes)
            {
                var card       = new Card();
                var container  = exampleNode.ParentNode;
                var parentKind = GetName(container);

                // Structure: nodes are listed outermost first; the example's own 'cf' span is last.
                var ownPattern = exampleNode.SelectSingleNode("span[@class='cf']");
                HtmlNode[] structureNodes;

                if (parentKind == "pv-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        container.SelectSingleNode("h4[@class='pv']"),
                        ownPattern
                    };
                }
                else if (parentKind == "n-g" && GetName(container.ParentNode) == "pv-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        container.ParentNode.SelectSingleNode("h4[@class='pv']"),
                        container.SelectSingleNode("span[@class='vs-g']"),
                        ownPattern
                    };
                }
                else if (parentKind == "n-g" || parentKind == "h-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        container.SelectSingleNode("span[@class='cf']"),
                        ownPattern
                    };
                }
                else if (parentKind == "id-g")
                {
                    structureNodes = new HtmlNode[]
                    {
                        container.SelectSingleNode("h4[@class='id']"),
                        ownPattern
                    };
                }
                else
                {
                    structureNodes = new HtmlNode[0];
                }

                foreach (var structureNode in structureNodes)
                {
                    if (structureNode != null)
                    {
                        card.Structure.Add(structureNode.InnerText);
                    }
                }

                // Definition: for 'id-g' and 'h-g' parents look inside the 'def_block' div when
                // there is one, otherwise in the parent itself; prefer span 'ud' over span 'd'.
                HtmlNode definitionScope = null;
                if (parentKind == "id-g" || parentKind == "h-g")
                {
                    definitionScope = container.SelectSingleNode("div[@class='def_block']");
                }
                if (definitionScope == null)
                {
                    definitionScope = container;
                }

                var definitionNode = definitionScope.SelectSingleNode("span[@class='ud']")
                                     ?? definitionScope.SelectSingleNode("span[@class='d']");

                // The example itself: the sentence is the example text without its explanation.
                card.Interpretation = getText(exampleNode.SelectSingleNode("span[@class='x']"));
                card.Sentence       = explanationRegex.Replace(card.Interpretation, "");

                card.Definition = getText(definitionNode);
                card.Definition = card.Definition.Replace("    ", " ");

                card.UsaTranscription = getText(usaNode);
                card.GbrTranscription = getText(gbrNode);

                // Keep the interpretation only when it differs from the plain sentence.
                if (card.Interpretation == card.Sentence)
                {
                    card.Interpretation = "";
                }

                if (card.Definition == "")
                {
                    reportStream.Write("Failure. Definition was not found. Link: " + link + " Example: '" + card.Sentence + "'\n");
                }

                if (card.Sentence == "")
                {
                    reportStream.Write("Failure. Example was not found. Link: " + link + "\n");
                }

                stream.Write(string.Format(
                    "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n",
                    card.Sentence,
                    card.Interpretation,
                    headword,
                    card.GbrTranscription,
                    card.UsaTranscription,
                    PrintList(card.Structure),
                    card.Definition,
                    cardLabels));
                count_++;
            }
        }
Пример #13
0
        /// <summary>
        /// Looks up every word of <paramref name="wordlist"/>, collects the links of the page's
        /// related-entries block and parses the pages that belong to the word. Each link is parsed
        /// at most once for the whole list. A summary is written to the report stream, which is
        /// saved at the end.
        /// </summary>
        /// <param name="stream">Output passed on to ParsePage.</param>
        /// <param name="wordlist">Words to look up.</param>
        /// <param name="labels">User labels passed on to ParsePage.</param>
        /// <param name="crossreferenceFlag">
        /// When true, every collected link that has not been parsed yet is parsed; when false,
        /// only the word's own page and links of the form "&lt;word&gt;_&lt;digits&gt;" are.
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List<string> wordlist, string labels, bool crossreferenceFlag)
        {
            // Maps a cleaned link to a flag telling whether its page has already been parsed.
            // The table outlives the per-word loop so that no page is parsed twice.
            var crossreferenceLinks = new Hashtable();

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                // Find out which link the loaded page itself corresponds to.
                var currentPageNode = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
                var currentPageLink = (currentPageNode != null) ? getCleanUrl(currentPageNode) : "";

                // Collect every link of the related-entries block.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode linkNode in linksNodes)
                    {
                        var url = getCleanUrl(linkNode);
                        if (crossreferenceLinks.ContainsKey(url) == false)
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }
                else
                {
                    reportStream.Write("Failure. Page not found. Link: " + word + "\n");
                }

                // Matches the numbered entries of the word ("<word>_<digits>"). Built once per
                // word; Regex.Escape keeps any regex metacharacter in the word literal, so a word
                // such as "c++" no longer produces an invalid pattern.
                var numberedEntryRegExp = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

                // The table must not be modified while its keys are enumerated, so the links
                // parsed in this pass are remembered and flagged afterwards.
                var parsedLinks = new List<string>();

                foreach (string link in crossreferenceLinks.Keys)
                {
                    if ((bool)crossreferenceLinks[link])
                    {
                        continue;
                    }

                    if (currentPageLink == link)
                    {
                        // This page has already been downloaded above.
                        ParsePage(ref stream, page, word, labels);
                    }
                    else if (crossreferenceFlag || numberedEntryRegExp.IsMatch(link))
                    {
                        ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                    }
                    else
                    {
                        // Unrelated link: leave it unparsed for a later word.
                        continue;
                    }

                    Console.WriteLine("{0}", link);
                    parsedLinks.Add(link);
                    reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                }

                foreach (string parsedLink in parsedLinks)
                {
                    crossreferenceLinks[parsedLink] = true;
                }
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Пример #14
0
        /// <summary>
        /// Looks up every word of <paramref name="wordlist"/>, gathers the links listed in the
        /// page's related-entries block and parses the pages relevant to the word. A link is
        /// never parsed more than once during the run. The report stream receives a summary and
        /// is saved at the end.
        /// </summary>
        /// <param name="stream">Output passed on to ParsePage.</param>
        /// <param name="wordlist">Words to look up.</param>
        /// <param name="labels">User labels passed on to ParsePage.</param>
        /// <param name="crossreferenceFlag">
        /// True to parse every collected link that is still unparsed; false to parse only the
        /// word's own page and links of the form "&lt;word&gt;_&lt;digits&gt;".
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, bool crossreferenceFlag)
        {
            // Cleaned link -> "already parsed" flag, shared by all words of the list.
            var    crossreferenceLinks = new Hashtable();
            string currentPageLink;

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                // Determine which link corresponds to the page that was just loaded.
                var currentPageLink_ = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
                currentPageLink = (currentPageLink_ != null) ? getCleanUrl(currentPageLink_) : "";

                // Take every link of the related-entries block.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode link in linksNodes)
                    {
                        var url = getCleanUrl(link);
                        if (crossreferenceLinks.ContainsKey(url) == false)
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }
                else
                {
                    reportStream.Write("Failure. Page not found. Link: " + word + "\n");
                }

                // Pattern for the word's numbered entries ("<word>_<digits>"). It only depends on
                // the word, so it is created once here rather than per link, and the word is
                // escaped so that regex metacharacters in it are matched literally.
                var tranformedWordRegExp = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

                // Flags are updated after the enumeration: the table cannot be changed while
                // its keys are being enumerated.
                var updatedWordList = new Hashtable();
                foreach (string link in crossreferenceLinks.Keys)
                {
                    if ((bool)crossreferenceLinks[link] == true)
                    {
                        continue;
                    }

                    var isCurrentPage = (currentPageLink == link);
                    var isWanted      = isCurrentPage || crossreferenceFlag == true || tranformedWordRegExp.IsMatch(link);

                    if (isWanted == false)
                    {
                        continue;
                    }

                    if (isCurrentPage)
                    {
                        // The first page has already been downloaded.
                        ParsePage(ref stream, page, word, labels);
                    }
                    else
                    {
                        ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                    }

                    Console.WriteLine("{0}", link);
                    updatedWordList.Add(link, true);
                    reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                }

                foreach (DictionaryEntry update in updatedWordList)
                {
                    crossreferenceLinks[update.Key] = update.Value;
                }
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }