Méthode publique Save()

public Save ( ) : void
Résultat void
Exemple #1
0
        /// <summary>
        /// Downloads example pages for every word in <paramref name="wordlist"/>, writes the parsed
        /// examples to <paramref name="stream"/>, and finally writes the running total to the report
        /// stream and saves it.
        /// </summary>
        /// <param name="stream">Destination that receives the output of <c>Parse</c>.</param>
        /// <param name="wordlist">Words to look up, processed in order.</param>
        /// <param name="labels">User labels; they are prefixed with "lingvo " and placed in front of each word.</param>
        /// <param name="limit">Upper bound for the paging offset; paging advances in steps of 5.</param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, int limit)
        {
            // The label prefix always starts with "lingvo "; non-empty user labels get a trailing separator.
            var labelPrefix = "lingvo " + labels;
            if (labels != "")
            {
                labelPrefix += " ";
            }

            foreach (var word in wordlist)
            {
                // The report line is written before any page of the word is downloaded.
                reportStream.Write("Success. Page was parsed. Word: " + word + "\n");
                Console.WriteLine("{0}", word);

                for (var offset = 0; offset < limit; offset += 5)
                {
                    var page = new HtmlDocument();
                    page.LoadHtml(DownloadExamples(word, offset));

                    // Rows marked "item first"; when there are none the word is reported as a failure.
                    var firstRows = page.DocumentNode.SelectNodes("//tr[@class='item first']");
                    if (firstRows == null)
                    {
                        reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                        break;
                    }

                    stream.Write(Parse(firstRows, labelPrefix + word));
                    count_ += firstRows.Count;

                    // Remaining rows marked "item"; when there are none paging stops for this word.
                    var remainingRows = page.DocumentNode.SelectNodes("//tr[@class='item']");
                    if (remainingRows == null)
                    {
                        break;
                    }

                    stream.Write(Parse(remainingRows, labelPrefix + word));
                    count_ += remainingRows.Count;
                }
            }

            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Exemple #2
0
        /// <summary>
        /// Parses the search page of every word in <paramref name="wordlist"/> and then the pages linked
        /// from its "entrylist" block, writing results to <paramref name="stream"/> via <c>ParsePage</c>.
        /// </summary>
        /// <param name="stream">Destination passed through to <c>ParsePage</c>.</param>
        /// <param name="wordlist">Words to look up, processed in order.</param>
        /// <param name="labels">Labels passed through to <c>ParsePage</c>.</param>
        /// <param name="crossreferenceFlag">
        /// When true every collected link is parsed; when false only links of the form
        /// "&lt;word&gt;_&lt;digits&gt;" (spaces in the word replaced by '-') are parsed.
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, bool crossreferenceFlag)
        {
            // Parse the source pages. It may be possible to speed this up through parallelization.
            // Idea (not implemented): also collect the links inside the article body and add them to the list.

            // Links taken from the search-results block (including the link to the word itself),
            // mapped to a flag telling whether the link has already been parsed.
            var crossreferenceLinks = new Hashtable();

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                ParsePage(ref stream, page, word, labels);
                Console.WriteLine("{0}", word);
                reportStream.Write("Success. Page was parsed. Link: " + word + "\n");

                // Take all the words from the search-results block.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@class='entrylist']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode link in linksNodes)
                    {
                        var url = getCleanUrl(link);
                        if (crossreferenceLinks.ContainsKey(url) == false)
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }

                // The pattern depends only on the word, so it is built once per word. The word is
                // escaped so that regex metacharacters in it are matched literally.
                var tranformedWordRegExp = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

                // The table cannot be modified while its keys are being enumerated, so the links
                // parsed in this pass are collected here and flagged afterwards.
                var updatedWordList = new Hashtable();

                foreach (string link in crossreferenceLinks.Keys)
                {
                    if ((bool)crossreferenceLinks[link])
                    {
                        continue;
                    }

                    if (crossreferenceFlag || tranformedWordRegExp.IsMatch(link))
                    {
                        ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);

                        Console.WriteLine("{0}", link);
                        updatedWordList.Add(link, true);
                        reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                    }
                }

                foreach (DictionaryEntry update in updatedWordList)
                {
                    crossreferenceLinks[update.Key] = update.Value;
                }
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
Exemple #3
0
        /// <summary>
        /// Command-line entry point. Reads a word list from the file given with <c>-p</c>, selects a
        /// parser (<c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c>) and runs it,
        /// writing the result to a time-stamped text file in the current directory.
        /// </summary>
        /// <param name="args">
        /// Supported options: the parser switches above, <c>-l &lt;labels&gt;</c>,
        /// <c>-limit &lt;number&gt;</c>, <c>-vdomain &lt;domain&gt;</c>, <c>-p &lt;path&gt;</c> and
        /// <c>-related</c>. Unknown arguments are ignored.
        /// </param>
        public static void Main(string[] args)
        {
            ParserMask useParser           = ParserMask.None;
            var        inputPath           = "";
            var        outputPath          = String.Empty;
            var        wordlist            = new List <String>();
            var        labels              = "";
            var        relatedFlag         = false;
            var        examplesLimit       = Int32.MaxValue;
            var        vocabularyComDomain = String.Empty;

            CardsStream output;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i] == "-oald8")
                {
                    outputPath = "oald8";
                    useParser |= ParserMask.Oald8;
                }
                else if (args[i] == "-macmillan")
                {
                    outputPath = "macmillan";
                    useParser |= ParserMask.Macmillan;
                }
                else if (args[i] == "-vocabcom")
                {
                    outputPath = "vocabulary.com";
                    useParser |= ParserMask.VocabularyCom;
                }
                else if (args[i] == "-lingvo")
                {
                    outputPath = "lingvo";
                    useParser |= ParserMask.LingvoRu;
                }

                else if (args[i] == "-l" && i + 1 < args.Length)
                {
                    labels = (args[i + 1]).Trim();
                    i++;
                }
                else if (args[i] == "-limit" && i + 1 < args.Length)
                {
                    // A malformed number is reported instead of crashing with an unhandled exception.
                    int parsedLimit;
                    if (Int32.TryParse((args[i + 1]).Trim(), out parsedLimit) == false)
                    {
                        Console.Write("Wrong limit: {0}", args[i + 1]);
                        return;
                    }

                    examplesLimit = parsedLimit;
                    i++;
                }

                else if (args[i] == "-vdomain" && i + 1 < args.Length)
                {
                    vocabularyComDomain = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-p" && i + 1 < args.Length)
                {
                    inputPath = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-related")
                {
                    relatedFlag = true;
                }
            }

            if (inputPath == "")
            {
                Console.Write("Where's a listname, uh? You should enter -p <path_to_you_wordlist>");
                Console.ReadKey();
                return;
            }

            try
            {
                // Everything except letters, digits, spaces, '-' and ''' is stripped from each line.
                var cleaner = new Regex("[^- 0-9a-zA-Z']+");
                // Tracks words already added so duplicates are skipped without rescanning the list.
                var seen    = new HashSet <String>();

                // The using block closes the reader even when reading fails half-way.
                using (var input = new StreamReader(inputPath))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        var word = cleaner.Replace(line, "").Trim();

                        // Lines that are blank after cleaning would otherwise become an empty lookup.
                        if (word.Length != 0 && seen.Add(word))
                        {
                            wordlist.Add(word);
                        }
                    }
                }
            }
            catch (FileNotFoundException)
            {
                // NOTE(review): as before, execution continues with whatever words were read.
                Console.Write("Wrong path: {0}", inputPath);
            }
            catch (DirectoryNotFoundException)
            {
                Console.Write("Wrong directory: {0}", inputPath);
            }

            if (labels.Length != 0)
            {
                outputPath = "./" + outputPath + " " + labels;
            }
            else
            {
                outputPath = "./" + outputPath;
            }

            outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

            output = new CardsStream(outputPath, 100000);

            // Only the first selected parser in this order runs, even if several switches were given.
            if ((useParser & ParserMask.Oald8) != ParserMask.None)
            {
                var parser = new Oald8();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
            {
                var parser = new Macmillan();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
            {
                var parser = new VocabularyCom();
                parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
            {
                var parser = new LingvoRu();
                parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }

            if (useParser != ParserMask.None)
            {
                output.Save();
            }
        }
        /// <summary>
        /// Command-line entry point. Reads a word list from the file given with <c>-p</c>, selects a
        /// parser (<c>-oald8</c>, <c>-macmillan</c>, <c>-vocabcom</c>, <c>-lingvo</c>) and runs it,
        /// writing the result to a time-stamped text file in the current directory.
        /// </summary>
        /// <param name="args">
        /// Supported options: the parser switches above, <c>-l &lt;labels&gt;</c>,
        /// <c>-limit &lt;number&gt;</c>, <c>-vdomain &lt;domain&gt;</c>, <c>-p &lt;path&gt;</c> and
        /// <c>-related</c>. Unknown arguments are ignored.
        /// </param>
        public static void Main(string[] args)
        {
            ParserMask useParser = ParserMask.None;
            var inputPath = "";
            var outputPath = String.Empty;
            var wordlist = new List<String>();
            var labels = "";
            var relatedFlag = false;
            var examplesLimit = Int32.MaxValue;
            var vocabularyComDomain = String.Empty;

            CardsStream output;

            for (int i = 0; i < args.Length; i++)
            {
                if (args[i] == "-oald8")
                {
                    outputPath = "oald8";
                    useParser |= ParserMask.Oald8;
                }
                else if (args[i] == "-macmillan")
                {
                    outputPath = "macmillan";
                    useParser |= ParserMask.Macmillan;
                }
                else if (args[i] == "-vocabcom")
                {
                    outputPath = "vocabulary.com";
                    useParser |= ParserMask.VocabularyCom;
                }
                else if (args[i] == "-lingvo")
                {
                    outputPath = "lingvo";
                    useParser |= ParserMask.LingvoRu;
                }

                else if (args[i] == "-l" && i + 1 < args.Length)
                {
                    labels = (args[i + 1]).Trim();
                    i++;
                }
                else if (args[i] == "-limit" && i + 1 < args.Length)
                {
                    // A malformed number is reported instead of crashing with an unhandled exception.
                    int parsedLimit;
                    if (Int32.TryParse((args[i + 1]).Trim(), out parsedLimit) == false)
                    {
                        Console.Write("Wrong limit: {0}", args[i + 1]);
                        return;
                    }

                    examplesLimit = parsedLimit;
                    i++;
                }

                else if (args[i] == "-vdomain" && i + 1 < args.Length)
                {
                    vocabularyComDomain = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-p" && i + 1 < args.Length)
                {
                    inputPath = (args[i + 1]).Trim();
                    i++;
                }

                else if (args[i] == "-related")
                {
                    relatedFlag = true;
                }
            }

            if (inputPath == "")
            {
                Console.Write("Where's a listname, uh? You should enter -p <path_to_you_wordlist>");
                Console.ReadKey();
                return;
            }

            try
            {
                // Everything except letters, digits, spaces, '-' and ''' is stripped from each line.
                var cleaner = new Regex("[^- 0-9a-zA-Z']+");
                // Tracks words already added so duplicates are skipped without rescanning the list.
                var seen = new HashSet<String>();

                // The using block closes the reader even when reading fails half-way.
                using (var input = new StreamReader(inputPath))
                {
                    string line;
                    while ((line = input.ReadLine()) != null)
                    {
                        var word = cleaner.Replace(line, "").Trim();

                        // Lines that are blank after cleaning would otherwise become an empty lookup.
                        if (word.Length != 0 && seen.Add(word))
                        {
                            wordlist.Add(word);
                        }
                    }
                }
            }
            catch (FileNotFoundException)
            {
                // NOTE(review): as before, execution continues with whatever words were read.
                Console.Write("Wrong path: {0}", inputPath);
            }
            catch (DirectoryNotFoundException)
            {
                Console.Write("Wrong directory: {0}", inputPath);
            }

            if (labels.Length != 0)
            {
                outputPath = "./" + outputPath + " " + labels;
            }
            else
            {
                outputPath = "./" + outputPath;
            }

            outputPath += " " + DateTime.Now.ToString("yyyy.MM.dd HH-mm-ss") + ".txt";

            output = new CardsStream(outputPath, 100000);

            // Only the first selected parser in this order runs, even if several switches were given.
            if ((useParser & ParserMask.Oald8) != ParserMask.None)
            {
                var parser = new Oald8();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.Macmillan) != ParserMask.None)
            {
                var parser = new Macmillan();
                parser.ProcessWordlist(ref output, wordlist, labels, relatedFlag);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.VocabularyCom) != ParserMask.None)
            {
                var parser = new VocabularyCom();
                parser.ProcessWordlist(ref output, wordlist, labels, vocabularyComDomain, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }
            else if ((useParser & ParserMask.LingvoRu) != ParserMask.None)
            {
                var parser = new LingvoRu();
                parser.ProcessWordlist(ref output, wordlist, labels, examplesLimit);
                Console.WriteLine("\nCount: {0}\n", parser.count);
            }

            if (useParser != ParserMask.None)
            {
                output.Save();
            }
        }
Exemple #5
0
        /// <summary>
        /// For every word in <paramref name="wordlist"/>: loads its definition page, builds an HTML
        /// snippet of the primary definitions, then downloads example sentences page by page and writes
        /// one tab-separated line per sentence to <paramref name="stream"/>. Finally writes the running
        /// total to the report stream and saves it.
        /// </summary>
        /// <param name="stream">Destination for the generated lines.</param>
        /// <param name="wordlist">Words to look up, processed in order.</param>
        /// <param name="userLabels">User labels; "vocabulary_com " is prepended to them.</param>
        /// <param name="domain">Value sent as the <c>domain</c> query parameter of the examples request.</param>
        /// <param name="limit">Maximum number of examples requested per word; values &lt;= 0 request nothing.</param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string userLabels, string domain, int limit)
        {
            // Largest page size requested in a single examples call.
            const int maxStep = 48;

            // The patterns do not depend on the word, so they are built once.
            var quickDefRegExp    = new Regex(@"quickDef(\d+)");
            var lineBreaksRegExp  = new Regex("[\t\r\n]");
            var leadingWordRegExp = new Regex(@"^\w+\s+(.*?)$");
            var dollarCallRegExp  = new Regex(@"\$d\((.*?)\)");

            userLabels = "vocabulary_com " + userLabels;

            foreach (String word in wordlist)
            {
                var offset             = 0;
                var wordLabel          = word.Replace(' ', '-');
                var primaryDefinitions = String.Empty;
                var page               = (new HtmlWeb()).Load(definitionsPath + word).DocumentNode;
                Console.WriteLine("{0}", word);

                // Primary definitions: first the "def" blocks, then the "def selected" blocks.
                foreach (var xpath in new[] { "//div[@class='def']", "//div[@class='def selected']" })
                {
                    var quickDefinitions = page.SelectNodes(xpath);
                    if (quickDefinitions == null)
                    {
                        continue;
                    }

                    foreach (var p in quickDefinitions)
                    {
                        // The numeric part of the "quickDefN" id selects the anchor "#sN" whose text is used as prefix.
                        var href = "#s" + quickDefRegExp.Replace(p.Attributes["id"].Value, "$1");
                        primaryDefinitions += "<i><b>" + getText(page.SelectSingleNode("//a[@href='" + href + "']")) + "</b></i> " + p.InnerText + "<br/>";
                    }
                }

                // Fallback when no quick definitions were found: use the "definition" headings.
                if (primaryDefinitions.Length == 0)
                {
                    var definitionsNodes = page.SelectNodes("//h3[@class='definition']");
                    if (definitionsNodes != null)
                    {
                        foreach (var f in definitionsNodes)
                        {
                            var partOfSpeech      = f.SelectSingleNode("a");
                            var currentDefinition = lineBreaksRegExp.Replace(getText(f), "");
                            // Drops the first word and the whitespace after it from the heading text.
                            currentDefinition   = leadingWordRegExp.Replace(currentDefinition, "$1");
                            primaryDefinitions += "<i><b>" + getText(partOfSpeech) + "</b></i> " + currentDefinition + "<br/>";
                        }

                        // Remove the trailing "<br/>" (only when one is actually there).
                        if (primaryDefinitions.EndsWith("<br/>", StringComparison.Ordinal))
                        {
                            primaryDefinitions = primaryDefinitions.Substring(0, primaryDefinitions.Length - 5);
                        }
                    }
                }

                while (offset < limit)
                {
                    var step          = (limit - offset < maxStep) ? limit - offset : maxStep;
                    var requestOffset = offset;

                    try
                    {
                        // NOTE(review): the payload is decoded as ASCII exactly as before; non-ASCII
                        // characters are lost. Confirm whether UTF-8 was intended.
                        var json_ = System.Text.Encoding.ASCII.GetString(client.DownloadData(examplesPath + word + "&maxResults=" + step + "&startOffset=" + offset + "&filter=0&domain=" + domain));

                        // Turns every $d(...) call in the payload into a quoted string so it parses as JSON.
                        json_ = dollarCallRegExp.Replace(json_, @"""$1""");
                        var json = JObject.Parse(json_);

                        if (offset == 0)
                        {
                            var hits = (int)json.SelectToken("result.totalHits");

                            reportStream.Write("Success. Word: " + word + " Hits: " + hits + "\n");
                            Console.WriteLine("Hits: {0}", hits);

                            if (hits == 0)
                            {
                                reportStream.Write("Failure. Examples was not found. Word: " + word + "\n");
                            }
                        }

                        Console.WriteLine("Processed: {0}", offset);

                        offset += step;

                        var received  = 0;
                        var sentences = json.SelectToken("result.sentences");
                        if (sentences != null)
                        {
                            foreach (var sentence in sentences)
                            {
                                received++;
                                count_++;
                                // "%%!!%%" is a temporary field separator that becomes a tab below.
                                var example = new StringBuilder()
                                              .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                                              .Append("%%!!%%")
                                              .Append(SafeTrim((string)sentence.SelectToken("sentence")))
                                              .Append("%%!!%%")
                                              .Append(word)
                                              .Append("%%!!%%")
                                              .Append(primaryDefinitions)
                                              .Append("%%!!%%")
                                              .Append(wordLabel)
                                              .Append(" ")
                                              .Append(userLabels);

                                stream.Write(lineBreaksRegExp.Replace(example.ToString(), "").Replace("%%!!%%", "\t") + "\n");
                            }
                        }

                        // An empty page means the examples are exhausted; without this check paging
                        // would go on until the offset reaches the limit.
                        if (received == 0)
                        {
                            break;
                        }
                    }
                    catch (Exception ex)
                    {
                        reportStream.Write("Failure. Word: " + word + " Offset: " + requestOffset + " Error: " + ex.Message + "\n");

                        // If the failure happened before the offset advanced, retrying would repeat
                        // the same request forever, so give up on this word. Otherwise the rest of the
                        // page is skipped and paging continues with the next one.
                        if (offset == requestOffset)
                        {
                            break;
                        }
                    }
                }

                Console.WriteLine("Processed: {0}\n", offset);
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }
        /// <summary>
        /// Loads the search page of every word in <paramref name="wordlist"/>, collects the links of its
        /// "relatedentries" block and parses the matching pages, writing results to
        /// <paramref name="stream"/> via <c>ParsePage</c>.
        /// </summary>
        /// <param name="stream">Destination passed through to <c>ParsePage</c>.</param>
        /// <param name="wordlist">Words to look up, processed in order.</param>
        /// <param name="labels">Labels passed through to <c>ParsePage</c>.</param>
        /// <param name="crossreferenceFlag">
        /// When true every collected link is parsed; when false only the current page and links of the
        /// form "&lt;word&gt;_&lt;digits&gt;" (spaces in the word replaced by '-') are parsed.
        /// </param>
        public void ProcessWordlist(ref CardsStream stream, List <string> wordlist, string labels, bool crossreferenceFlag)
        {
            // Parse the source pages. It may be possible to speed this up through parallelization.
            // Idea (not implemented): also collect the links inside the article body and add them to the list.

            // Links taken from the search-results block (including the link to the word itself),
            // mapped to a flag telling whether the link has already been parsed.
            var crossreferenceLinks = new Hashtable();

            foreach (String word in wordlist)
            {
                var page = (new HtmlWeb()).Load(searchPath + word);

                // Determine which link the word's own page is located at.
                var currentPageNode = page.DocumentNode.SelectSingleNode("//li[@class='currentpage']/a");
                string currentPageLink = (currentPageNode != null) ? getCleanUrl(currentPageNode) : "";

                // Take all the words from the search-results block.
                var linksNodes = page.DocumentNode.SelectNodes("//div[@id='relatedentries']/ul/li/a");
                if (linksNodes != null)
                {
                    foreach (HtmlNode link in linksNodes)
                    {
                        var url = getCleanUrl(link);
                        if (crossreferenceLinks.ContainsKey(url) == false)
                        {
                            crossreferenceLinks.Add(url, false);
                        }
                    }
                }
                else
                {
                    reportStream.Write("Failure. Page not found. Link: " + word + "\n");
                }

                // The pattern depends only on the word, so it is built once per word. The word is
                // escaped so that regex metacharacters in it are matched literally.
                var tranformedWordRegExp = new Regex("^(" + Regex.Escape(word.Replace(' ', '-')) + "_\\d+)$", RegexOptions.IgnoreCase);

                // Everything needed has been collected; process the cross-reference links. The table
                // cannot be modified while its keys are being enumerated, so the links parsed in this
                // pass are collected here and flagged afterwards.
                var updatedWordList = new Hashtable();
                foreach (string link in crossreferenceLinks.Keys)
                {
                    if ((bool)crossreferenceLinks[link])
                    {
                        continue;
                    }

                    if (currentPageLink == link)
                    {
                        // The first page has already been downloaded, so it is reused.
                        ParsePage(ref stream, page, word, labels);
                    }
                    else if (crossreferenceFlag || tranformedWordRegExp.IsMatch(link))
                    {
                        ParsePage(ref stream, (new HtmlWeb()).Load(searchPath + link), link, labels);
                    }
                    else
                    {
                        // Not selected for parsing in this pass; the link stays unflagged.
                        continue;
                    }

                    Console.WriteLine("{0}", link);
                    updatedWordList.Add(link, true);
                    reportStream.Write("Success. Page was parsed. Link: " + link + "\n");
                }

                foreach (DictionaryEntry update in updatedWordList)
                {
                    crossreferenceLinks[update.Key] = update.Value;
                }
            }
            reportStream.Write("Total: " + count_ + "\n");
            reportStream.Save();
        }