コード例 #1
0
        // Create a table with post-Soviet ranges to push to PostgreSQL later

        /// <summary>
        /// Copies the tab-separated file <paramref name="f1"/> to <paramref name="f2"/>,
        /// replacing the second column of every row with its decoded form.
        /// </summary>
        /// <remarks>
        /// The second column is run through <c>QuotedPrintable.DecodeQuotedPrintable</c>
        /// three times in a row; the result is cached per distinct raw value so each
        /// value is decoded only once. Empty lines are dropped from the output.
        /// Lines that contain no tab (and therefore no second column) are copied unchanged.
        /// </remarks>
        /// <param name="f1">Path of the input file, read as UTF-8.</param>
        /// <param name="f2">Path of the output file; it is overwritten if it already exists.</param>
        static public void DecodePageNames(String f1, String f2)
        {
            // Raw second-column value -> fully decoded value.
            Dictionary <String, String> mapping = new Dictionary <String, String>();

            // using guarantees both files are closed even when decoding or I/O throws.
            using (StreamReader sr = new StreamReader(f1, Encoding.UTF8))
            using (StreamWriter sw = new StreamWriter(f2, false))
            {
                String str;

                while ((str = sr.ReadLine()) != null)
                {
                    if (str == "")
                    {
                        continue;
                    }

                    String[] items = str.Split('\t');

                    // No second column to decode: pass the line through instead of
                    // failing with an IndexOutOfRangeException.
                    if (items.Length < 2)
                    {
                        sw.WriteLine(str);
                        continue;
                    }

                    String decoded;
                    if (!mapping.TryGetValue(items[1], out decoded))
                    {
                        decoded = items[1];
                        // The decoder is applied three times, exactly as many passes as before.
                        for (int pass = 0; pass < 3; pass++)
                        {
                            decoded = QuotedPrintable.DecodeQuotedPrintable(decoded, "UTF-8");
                        }
                        mapping.Add(items[1], decoded);
                    }

                    items[1] = decoded;
                    sw.WriteLine(String.Join("\t", items));
                }
            }
        }
コード例 #2
0
ファイル: HTMLDigger.cs プロジェクト: Lenchickk/WikiDigger
        /// <summary>
        /// Walks the "all pages" listing of namespace 14 on ru.wikipedia.org page by page
        /// and collects every listed category whose name contains one of the given keys.
        /// </summary>
        /// <remarks>
        /// Matching is case-insensitive via <c>ToLower()</c> on both sides. On each page the
        /// list items are scanned in document order and scanning stops at the first item
        /// whose text does not contain "Категория:". Paging continues while a navigation
        /// block containing "Следующая страница" is found.
        /// </remarks>
        /// <param name="keys">Search keys; only the dictionary keys are used, the values are ignored.</param>
        /// <returns>
        /// One three-element list per match: the matching key, the category name with the
        /// "Категория:" prefix removed, and the literal string "14".
        /// </returns>
        static public List <List <String> > ReturnWikiCategory(SortedDictionary <String, Int64> keys)
        {
            const String baseString     = "https://ru.wikipedia.org";
            const String categoryPrefix = "Категория:";
            const String nextPageLabel  = "Следующая страница";

            List <List <String> > buf = new List <List <String> >();
            var    document = new HtmlDocument();
            String tail     = "/w/index.php?title=Служебная:Все_страницы&from=%28hed%29+P.E.&namespace=14";
            bool   hasNextPage;

            using (var client = new WebClient())
            {
                do
                {
                    String html;
                    // Dispose the response stream and reader for every page instead of leaking them.
                    using (var stream = client.OpenRead(baseString + tail))
                    using (var reader = new StreamReader(stream, Encoding.GetEncoding("UTF-8")))
                    {
                        html = reader.ReadToEnd();
                    }
                    document.LoadHtml(html);

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//li");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            if (!node.InnerText.Contains(categoryPrefix))
                            {
                                break;
                            }

                            // NOTE(review): assumes the item text starts with the prefix — confirm.
                            String name  = node.InnerText.Substring(categoryPrefix.Length);
                            String value = name.ToLower();

                            foreach (String key in keys.Keys)
                            {
                                if (value.Contains(key.ToLower()))
                                {
                                    List <String> pair = new List <String>();
                                    pair.Add(key);
                                    pair.Add(name);
                                    pair.Add("14");
                                    buf.Add(pair);
                                }
                            }
                        }
                    }

                    hasNextPage = false;
                    nodes       = document.DocumentNode.SelectNodes("//div[@class='mw-allpages-nav']");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            String val = node.InnerHtml;
                            if (node.InnerText.Contains("|"))
                            {
                                // Skip everything before the separator so the first quoted
                                // attribute found below belongs to the link after it.
                                val = val.Substring(val.IndexOf("|"));
                            }
                            if (node.InnerText.Contains(nextPageLabel))
                            {
                                // The next-page URL is taken as the text between the first pair of double quotes.
                                int firstpos = val.IndexOf("\"");
                                int lastpos  = val.Substring(firstpos + 1).IndexOf("\"");
                                tail = val.Substring(firstpos + 1, lastpos);
                                // NOTE(review): "Привет" is passed here while DecodePageNames passes
                                // "UTF-8" in the same position; left unchanged — confirm what
                                // DecodeQuotedPrintable does with it.
                                tail = QuotedPrintable.DecodeQuotedPrintable(tail, "Привет");
                                // Turns the HTML-escaped "&amp;" in the href back into "&".
                                tail = tail.Replace("amp;", "");
                                hasNextPage = true;
                                break;
                            }
                        }
                    }
                } while (hasNextPage);
            }

            return(buf);
        }