示例#1
0
        /// <summary>
        /// Builds a set from the first column of every row in <paramref name="dt"/>.
        /// Each cell is converted to a string, trimmed, and passed through
        /// <c>QuotedPrintable.EncodeQuotedPrintable</c> before being added.
        /// </summary>
        /// <param name="dt">Table whose first column supplies the values.</param>
        /// <returns>A set of the encoded values; duplicates collapse into one entry.</returns>
        public static HashSet <String> DataTableToHashSet(DataTable dt)
        {
            HashSet <String>  encodedValues = new HashSet <String>();
            DataRowCollection rows          = dt.Rows;

            for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
            {
                // Only column 0 is read; any further columns are ignored.
                String cellText = rows[rowIndex][0].ToString().Trim();
                encodedValues.Add(QuotedPrintable.EncodeQuotedPrintable(cellText));
            }

            return encodedValues;
        }
示例#2
0
        /// <summary>
        /// Scans <paramref name="source"/> for every key of <c>Common.keys</c> in the form ":" + key
        /// and updates <c>Common.keysPower</c> for each key that occurs.
        /// </summary>
        /// <param name="sw">Writer that receives at most one line for the page.</param>
        /// <param name="page">Page title; written raw and quoted-printable encoded.</param>
        /// <param name="source">Text that is searched for the keys.</param>
        /// <param name="pageFlag">
        /// When true, the first matching key writes "page TAB encoded-page TAB key" to
        /// <paramref name="sw"/> and has its counter set to 1; every other match increments
        /// the counter of its key. When false, nothing is written and all matches increment.
        /// </param>
        public static void PrintCategogy(StreamWriter sw, String page, String source, Boolean pageFlag)
        {
            // True while the single output line for this page is still to be written.
            Boolean lineOwed = pageFlag;

            foreach (String key in Common.keys.Keys)
            {
                Boolean mentioned = source.Contains(":" + key);
                if (!mentioned)
                {
                    continue;
                }

                if (lineOwed)
                {
                    sw.WriteLine(page + "\t" + QuotedPrintable.EncodeQuotedPrintable(page) + "\t" + key);
                    lineOwed = false;
                    // The first hit assigns the counter instead of incrementing it.
                    Common.keysPower[key] = 1;
                }
                else
                {
                    Common.keysPower[key]++;
                }
            }
        }
示例#3
0
        /// <summary>
        /// Walks the paginated "all pages" listing on ru.wikipedia.org, starting from a fixed URL,
        /// and collects every listed category whose name contains one of the supplied keys.
        /// </summary>
        /// <param name="keys">Keys to look for; only the dictionary keys are used, matched case-insensitively via <c>ToLower</c>.</param>
        /// <returns>
        /// One three-element list per match: the matching key, the category name without the
        /// "Категория:" prefix, and the literal "14" (the same value as the <c>namespace</c>
        /// parameter of the start URL).
        /// </returns>
        static public List <List <String> > ReturnWikiCategory(SortedDictionary <String, Int64> keys)
        {
            const String categoryPrefix = "Категория:";

            List <List <String> > buf = new List <List <String> >();
            var    document   = new HtmlDocument();
            String baseString = "https://ru.wikipedia.org";
            String tail       = "/w/index.php?title=Служебная:Все_страницы&from=%28hed%29+P.E.&namespace=14";
            bool   hasNextPage;

            // One client for the whole crawl, released when the crawl finishes.
            using (var client = new WebClient())
            {
                do
                {
                    String html;
                    // Dispose the response stream and reader of every page instead of leaking them.
                    using (var stream = client.OpenRead(baseString + tail))
                    using (var reader = new StreamReader(stream, Encoding.GetEncoding("UTF-8")))
                    {
                        html = reader.ReadToEnd();
                    }
                    document.LoadHtml(html);

                    // SelectNodes returns null (not an empty collection) when nothing matches.
                    HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//li");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            // The first list item without the category prefix ends the scan of this page.
                            if (!node.InnerText.Contains(categoryPrefix))
                            {
                                break;
                            }
                            String categoryName = node.InnerText.Substring(categoryPrefix.Length);
                            String value        = categoryName.ToLower();

                            foreach (String key in keys.Keys)
                            {
                                if (value.Contains(key.ToLower()))
                                {
                                    List <String> pair = new List <String>();
                                    pair.Add(key);
                                    pair.Add(categoryName);
                                    pair.Add("14");
                                    buf.Add(pair);
                                }
                            }
                        }
                    }

                    // Look for the "next page" link; the crawl stops when none is found.
                    hasNextPage = false;
                    nodes       = document.DocumentNode.SelectNodes("//div[@class='mw-allpages-nav']");
                    if (nodes != null)
                    {
                        foreach (HtmlNode node in nodes)
                        {
                            String val = node.InnerHtml;
                            if (node.InnerText.Contains("|"))
                            {
                                // Drop everything before the separator so the first quoted
                                // attribute that follows is taken as the link.
                                int barPos = val.IndexOf("|");
                                if (barPos >= 0)
                                {
                                    val = val.Substring(barPos);
                                }
                            }
                            if (node.InnerText.Contains("Следующая страница"))
                            {
                                // Extract the text between the first pair of double quotes.
                                int firstpos = val.IndexOf("\"");
                                int lastpos  = val.Substring(firstpos + 1).IndexOf("\"");
                                tail = val.Substring(firstpos + 1, lastpos);
                                tail = QuotedPrintable.DecodeQuotedPrintable(tail, "Привет");
                                // Turns the HTML-escaped "&amp;" of the attribute back into "&".
                                tail = tail.Replace("amp;", "");
                                hasNextPage = true;
                                break;
                            }
                        }
                    }
                } while (hasNextPage);
            }

            return buf;
        }
示例#4
0
        /// <summary>
        /// Streams the wiki XML dump at <c>Common.wikixmlFile</c>. For every <c>revision</c> element it
        /// buffers one tab-separated edit row and updates per-editor counters; the buffers are handed
        /// to <c>ReleaseFile</c> whenever more than 10000 edit rows are pending and once more when
        /// parsing ends.
        /// </summary>
        /// <param name="file">
        /// Not used by this method: the input path is always <c>Common.wikixmlFile</c>.
        /// NOTE(review): confirm whether callers expect this argument to select the input file.
        /// </param>
        static public void ParseXMLwikiForEdits(String file)
        {
            // XmlReader does not close a TextReader it was created from by default,
            // so the StreamReader is disposed explicitly as well.
            using (StreamReader input = new StreamReader(Common.wikixmlFile, Encoding.UTF8))
            using (XmlReader XMLreader = XmlReader.Create(input))
            {
                PostGrePlugIn.openConnection();
                Console.OutputEncoding = Encoding.Unicode;

                // Opening with append == false and closing immediately leaves both output files empty.
                StreamWriter editsStream   = new StreamWriter(Common.editsFile, false, Encoding.UTF8);
                StreamWriter editorsStream = new StreamWriter(Common.editorsFile, false, Encoding.UTF8);

                editsStream.Close();
                editorsStream.Close();

                List <String> editsStore = new List <string>();
                Dictionary <String, List <Object> > editorsStore = new Dictionary <string, List <Object> >();

                Boolean flagReadTitle    = false;
                String  pageName         = "";
                Boolean readRevisionMode = false;
                // editorID and comment keep their last value when a revision does not set them.
                String  editorID         = "";
                String  comment          = "";
                String  pageID           = "";

                try
                {
                    while (XMLreader.Read())
                    {
                        if (XMLreader.Name == "title" && (XMLreader.NodeType == XmlNodeType.Element))
                        {
                            flagReadTitle = true;
                            continue;
                        }

                        if (XMLreader.Name == "title" && (XMLreader.NodeType == XmlNodeType.EndElement))
                        {
                            flagReadTitle = false;
                            continue;
                        }

                        if (XMLreader.Name == "page" && (XMLreader.NodeType == XmlNodeType.EndElement))
                        {
                            // Leaving a page ends "interesting page" mode.
                            readRevisionMode = false;
                            continue;
                        }

                        if (XMLreader.NodeType == XmlNodeType.Text && flagReadTitle)
                        {
                            pageName = XMLreader.Value;
                            if (Common.interestPages.Contains(QuotedPrintable.EncodeQuotedPrintable(pageName)))
                            {
                                readRevisionMode = true;
                            }
                            // Advance to the next <id> element; the node after it carries the page id.
                            while (XMLreader.Name != "id")
                            {
                                XMLreader.Read();
                            }
                            XMLreader.Read();
                            pageID        = XMLreader.Value;
                            flagReadTitle = false;
                        }

                        if (XMLreader.Name == "revision" && XMLreader.NodeType == XmlNodeType.Element)
                        {
                            String   timeString = "";
                            DateTime day        = new DateTime();
                            String   ip         = "NA";
                            String   country    = "NA";
                            Byte     troll      = 0;
                            Int32    typetroll  = -1;
                            String   name       = "NA";
                            Int32    nameID     = -1;
                            Double   distance   = -1;
                            // 1 when the revision belongs to a page listed in Common.interestPages.
                            Byte     inGroup    = 0;

                            if (readRevisionMode)
                            {
                                inGroup = 1;
                            }

                            while (!(XMLreader.NodeType == XmlNodeType.EndElement && XMLreader.Name == "revision"))
                            {
                                XMLreader.Read();
                                if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "timestamp")
                                {
                                    XMLreader.Read();
                                    timeString = XMLreader.Value;
                                    day        = DateTime.Parse(timeString);
                                    XMLreader.Read();
                                }

                                if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "contributor")
                                {
                                    // NOTE(review): this scan does not stop at the end of <contributor>;
                                    // if neither <ip> nor <username> is present it reads on into later elements.
                                    while (XMLreader.Name != "ip" && XMLreader.Name != "username")
                                    {
                                        XMLreader.Read();
                                    }

                                    if (XMLreader.Name == "ip")
                                    {
                                        // Anonymous edit: the IP doubles as the editor key.
                                        XMLreader.Read();
                                        ip      = editorID = XMLreader.Value;
                                        country = PostGrePlugIn.DataTableToList(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("countryipranges", ip, "country")))[0];
                                        PostGrePlugIn.GetTrollResults(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("troll_bases", ip, " c1,c2,c3,c4, distance ")), out troll, out distance, out typetroll);
                                    }
                                    else
                                    {
                                        // Registered edit: the numeric user id is the editor key.
                                        XMLreader.Read();
                                        name = XMLreader.Value.Replace(@"\", "_slash_");

                                        while (XMLreader.Name != "id")
                                        {
                                            XMLreader.Read();
                                        }
                                        XMLreader.Read();

                                        nameID   = Int32.Parse(XMLreader.Value);
                                        editorID = nameID.ToString();
                                    }

                                    // Move to the closing </contributor>, then two nodes further,
                                    // where a <comment> element is expected if the revision has one.
                                    while (XMLreader.Name != "contributor")
                                    {
                                        XMLreader.Read();
                                    }
                                    XMLreader.Read();
                                    XMLreader.Read();
                                    if (XMLreader.Name == "comment")
                                    {
                                        XMLreader.Read();
                                        // Strip characters that would break the tab-separated row.
                                        comment = XMLreader.Value.Replace('\n', ' ');
                                        comment = comment.Replace(@"\", " ");
                                        comment = comment.Replace('\t', ' ');
                                    }
                                    else
                                    {
                                        comment = "";
                                    }

                                    break;
                                }
                            }

                            // Per-editor record: [0] total edits, [1] edits on pages of interest, [2] descriptive row.
                            List <Object> editorStats;
                            if (!editorsStore.TryGetValue(editorID, out editorStats))
                            {
                                editorStats = new List <Object>();
                                editorStats.Add(1);
                                editorStats.Add(1 * inGroup);
                                editorStats.Add(ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString());
                                editorsStore.Add(editorID, editorStats);
                            }
                            else
                            {
                                editorStats[0] = Int32.Parse(editorStats[0].ToString()) + 1;
                                // Grow the in-group counter from its own previous value, matching its
                                // initialisation above (previously it was derived from slot [0] plus one).
                                editorStats[1] = Int32.Parse(editorStats[1].ToString()) + inGroup;
                            }

                            editsStore.Add(pageName + "\t" + pageID + "\t" + inGroup.ToString() + "\t" + CleanExactTime(timeString) + "\t" + day.ToString("yyyy-MM-dd") + "\t" + ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString() + "\t" + comment);

                            // Both buffers are released together, keyed on the number of pending edit rows.
                            if (editsStore.Count > 10000)
                            {
                                editorsStore = ReleaseFile(Common.editorsFile, editorsStore);
                                editsStore   = ReleaseFile(Common.editsFile, editsStore);
                            }
                        }
                    }
                }
                catch (XmlException ex)
                {
                    Console.WriteLine("Ended abnormally..." + ex);
                }

                // Release whatever is still buffered; without this the rows collected since the
                // last threshold flush would be dropped when the method returns.
                if (editorsStore.Count > 0)
                {
                    editorsStore = ReleaseFile(Common.editorsFile, editorsStore);
                }
                if (editsStore.Count > 0)
                {
                    editsStore = ReleaseFile(Common.editsFile, editsStore);
                }
            }
        }