Exemple #1
0
        // Extract all pages for a set of categories from Russian Wikipedia
        //table - categories

        /// <summary>
        /// Loads the category table, resolves the wiki pages for those categories and hands
        /// the resulting nested list to <c>ObjectToSource.ComplexListToSource</c> together
        /// with <c>Common.pagesFilonCategory</c>.
        /// </summary>
        public static void PutToFileAllPagesForCategoriesRU()
        {
            // Step 1: category table as a sorted dictionary.
            SortedDictionary<String, String> categoryTable = PostGrePlugIn.getTableCategoriesAsDictionary();

            // Step 2: one inner list of strings per entry returned by the digger.
            List<List<String>> pagesPerCategory = HTMLDigger.ReturnWikiPagesForCategories(categoryTable);

            // Step 3: serialise the nested list.
            ObjectToSource.ComplexListToSource<String>(Common.pagesFilonCategory, pagesPerCategory);
        }
Exemple #2
0
        // Extract all pages for a set of keywords from Russian Wikipedia
        /// <summary>
        /// Loads the keyword table, resolves the wiki pages for those keywords and hands
        /// the resulting nested list to <c>ObjectToSource.ComplexListToSource</c> together
        /// with <c>Common.pagesFile</c>.
        /// </summary>
        public static void PutToFileAllPagesForKeyWordsRU()
        {
            // Keyword table: string key mapped to a 64-bit integer.
            SortedDictionary<String, Int64> keywordTable = PostGrePlugIn.getTableKeysAsDictionary();

            // Pages found for the keywords, as a list of string lists.
            List<List<String>> pagesPerKeyword = HTMLDigger.ReturnWikiPages(keywordTable);

            ObjectToSource.ComplexListToSource<String>(Common.pagesFile, pagesPerKeyword);
        }
Exemple #3
0
        /// <summary>
        /// Walks the Wikipedia XML dump at <c>Common.wikixmlFile</c> and, for every
        /// <c>revision</c> element, buffers one tab-separated edit line and updates a
        /// per-editor entry. Both buffers are passed to <c>ReleaseFile</c> each time more
        /// than 10000 edit lines have accumulated, and once more after parsing stops.
        /// </summary>
        /// <param name="file">
        /// Currently not read: the dump path is always taken from <c>Common.wikixmlFile</c>.
        /// </param>
        /// <remarks>
        /// An <see cref="XmlException"/> is caught and reported on the console; whatever was
        /// buffered up to that point is still released. Other exception types propagate.
        /// </remarks>
        public static void ParseXMLwikiForEdits(String file)
        {
            // XmlReader.Create(TextReader) does not close its input by default, so the
            // StreamReader is disposed explicitly alongside the XmlReader.
            using (StreamReader dumpStream = new StreamReader(Common.wikixmlFile, Encoding.UTF8))
            using (XmlReader XMLreader = XmlReader.Create(dumpStream))
            {
                PostGrePlugIn.openConnection();
                Console.OutputEncoding = Encoding.Unicode;

                // Opening without append creates/truncates both output files; nothing is
                // written here. Presumably ReleaseFile appends to them later - confirm.
                using (new StreamWriter(Common.editsFile, false, Encoding.UTF8))
                using (new StreamWriter(Common.editorsFile, false, Encoding.UTF8))
                {
                }

                List <String> editsStore = new List <string>();

                // Per editor: [0] number of edits seen, [1] number of those edits made on
                // pages of interest, [2] tab-separated description of the editor.
                Dictionary <String, List <Object> > editorsStore = new Dictionary <string, List <Object> >();

                Boolean flagReadTitle    = false;
                String  pageName         = "";
                Boolean readRevisionMode = false;
                // NOTE: editorID and comment live across revisions, so a revision that sets
                // neither keeps the values of the previous one.
                String  editorID         = "";
                String  comment          = "";
                String  pageID           = "";

                try
                {
                    while (XMLreader.Read())
                    {
                        if (XMLreader.Name == "title" && (XMLreader.NodeType == XmlNodeType.Element))
                        {
                            flagReadTitle = true;
                            continue;
                        }

                        if (XMLreader.Name == "title" && (XMLreader.NodeType == XmlNodeType.EndElement))
                        {
                            flagReadTitle = false;
                            continue;
                        }

                        if (XMLreader.Name == "page" && (XMLreader.NodeType == XmlNodeType.EndElement))
                        {
                            // Leaving the page: the "page of interest" marker no longer applies.
                            readRevisionMode = false;
                            continue;
                        }

                        if (XMLreader.NodeType == XmlNodeType.Text && flagReadTitle)
                        {
                            pageName = XMLreader.Value;
                            if (Common.interestPages.Contains(QuotedPrintable.EncodeQuotedPrintable(pageName)))
                            {
                                readRevisionMode = true;
                            }

                            // The first "id" node after the title is taken as the page id.
                            AdvanceToNamedNode(XMLreader, "id");
                            XMLreader.Read();
                            pageID        = XMLreader.Value;
                            flagReadTitle = false;
                        }

                        if (XMLreader.Name == "revision" && XMLreader.NodeType == XmlNodeType.Element)
                        {
                            String   timeString = "";
                            DateTime day        = new DateTime();
                            String   ip         = "NA";
                            String   country    = "NA";
                            Byte     troll      = 0;
                            Int32    typetroll  = -1;
                            String   name       = "NA";
                            Int32    nameID     = -1;
                            Double   distance   = -1;
                            Byte     inGroup    = 0;

                            if (readRevisionMode)
                            {
                                inGroup = 1;
                            }

                            while (!(XMLreader.NodeType == XmlNodeType.EndElement && XMLreader.Name == "revision"))
                            {
                                if (!XMLreader.Read())
                                {
                                    throw new XmlException("Unexpected end of document inside <revision>.");
                                }

                                if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "timestamp")
                                {
                                    XMLreader.Read();
                                    timeString = XMLreader.Value;
                                    day        = DateTime.Parse(timeString);
                                    XMLreader.Read();
                                }

                                if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "contributor")
                                {
                                    // NOTE(review): a self-closing <contributor/> element is not
                                    // special-cased; the search below would run past it.
                                    AdvanceToNamedNode(XMLreader, "ip", "username");

                                    if (XMLreader.Name == "ip")
                                    {
                                        // Anonymous edit: the IP doubles as the editor key.
                                        XMLreader.Read();
                                        ip      = editorID = XMLreader.Value;
                                        country = PostGrePlugIn.DataTableToList(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("countryipranges", ip, "country")))[0];
                                        PostGrePlugIn.GetTrollResults(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("troll_bases", ip, " c1,c2,c3,c4, distance ")), out troll, out distance, out typetroll);
                                    }
                                    else
                                    {
                                        // Registered user: name first, then the numeric user id.
                                        XMLreader.Read();
                                        name = XMLreader.Value.Replace(@"\", "_slash_");

                                        AdvanceToNamedNode(XMLreader, "id");
                                        XMLreader.Read();

                                        nameID   = Int32.Parse(XMLreader.Value);
                                        editorID = nameID.ToString();
                                    }

                                    // Move to the closing contributor tag, then two nodes further;
                                    // a <comment> element is only picked up if it is found exactly there.
                                    AdvanceToNamedNode(XMLreader, "contributor");
                                    XMLreader.Read();
                                    XMLreader.Read();
                                    if (XMLreader.Name == "comment")
                                    {
                                        XMLreader.Read();
                                        // Keep the comment on one line and free of the column separator.
                                        comment = XMLreader.Value.Replace('\n', ' ');
                                        comment = comment.Replace(@"\", " ");
                                        comment = comment.Replace('\t', ' ');
                                    }
                                    else
                                    {
                                        comment = "";
                                    }

                                    break;
                                }
                            }

                            List <Object> editorStats;
                            if (!editorsStore.TryGetValue(editorID, out editorStats))
                            {
                                editorStats = new List <Object>();
                                editorStats.Add(1);
                                editorStats.Add(1 * inGroup);
                                editorStats.Add(ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString());
                                editorsStore.Add(editorID, editorStats);
                            }
                            else
                            {
                                editorStats[0] = Convert.ToInt32(editorStats[0]) + 1;
                                // Accumulate in-group edits; the previous formula derived this
                                // from the total count and overwrote the running value.
                                editorStats[1] = Convert.ToInt32(editorStats[1]) + inGroup;
                            }

                            editsStore.Add(pageName + "\t" + pageID + "\t" + inGroup.ToString() + "\t" + CleanExactTime(timeString) + "\t" + day.ToString("yyyy-MM-dd") + "\t" + ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString() + "\t" + comment);

                            // Both buffers are released together, driven by the edit buffer size.
                            if (editsStore.Count > 10000)
                            {
                                editorsStore = ReleaseFile(Common.editorsFile, editorsStore);
                                editsStore   = ReleaseFile(Common.editsFile, editsStore);
                            }
                        }
                    }
                }
                catch (XmlException ex)
                {
                    Console.WriteLine("Ended abnormally..." + ex);
                }

                // Release the final partial batch; without this the last (up to 10000)
                // edits and their editors would never reach ReleaseFile.
                if (editorsStore.Count > 0)
                {
                    ReleaseFile(Common.editorsFile, editorsStore);
                }
                if (editsStore.Count > 0)
                {
                    ReleaseFile(Common.editsFile, editsStore);
                }
            }
        }

        // Advances the reader until its current node carries one of the given names.
        // Throws XmlException instead of spinning forever when the document ends first.
        private static void AdvanceToNamedNode(XmlReader reader, params String[] names)
        {
            while (Array.IndexOf(names, reader.Name) < 0)
            {
                if (!reader.Read())
                {
                    throw new XmlException("Unexpected end of document while looking for <" + String.Join("> or <", names) + ">.");
                }
            }
        }
Exemple #4
0
        /// <summary>
        /// Runs <paramref name="sqlRequest"/> and writes one tab-separated line per
        /// (date, page name) group to <paramref name="fileTo"/>: the date, the page name,
        /// one value per counter ("troll", "NS", then each country returned by
        /// <c>Common.getCountriesDB</c>) and finally the sum of all non-troll counters.
        /// </summary>
        /// <param name="fileTo">Output file; it is overwritten.</param>
        /// <param name="sqlRequest">
        /// Query whose rows are read by position: column 0 page name, column 2 date,
        /// column 3 equal to "1" marks a troll row, column 4 country (empty means "NS"),
        /// column 5 the value stored for that counter. Rows belonging to the same
        /// (date, page) are expected to be adjacent, since a line is emitted whenever
        /// either value changes.
        /// </param>
        public static void BringRowsTogetherAggregatedComment(String fileTo, String sqlRequest)
        {
            // using guarantees the file is closed even if a query or a parse throws.
            using (StreamWriter sw = new StreamWriter(fileTo, false))
            {
                DataTable     dt        = PostGrePlugIn.getTablePostGre(sqlRequest);
                List <String> countries = PostGrePlugIn.DataTableToList(PostGrePlugIn.getTablePostGre(Common.getCountriesDB));

                DateTime date      = new DateTime();
                String   pagename  = "";
                bool     firstFlag = true;
                Dictionary <String, String> values = NewAggregatedCounters(countries);

                foreach (DataRow dr in dt.Rows)
                {
                    DateTime rowDate = DateTime.Parse(dr[2].ToString());
                    String   rowPage = dr[0].ToString().Trim();

                    // A change of date or page closes the group collected so far.
                    if (!firstFlag && (rowDate != date || rowPage != pagename))
                    {
                        WriteAggregatedLine(sw, date, pagename, values);
                        values = NewAggregatedCounters(countries);
                    }

                    firstFlag = false;
                    date      = rowDate;
                    pagename  = rowPage;

                    if (dr[3].ToString() == "1")
                    {
                        values["troll"] = dr[5].ToString();
                        continue;
                    }

                    if (dr[4].ToString() == "")
                    {
                        values["NS"] = dr[5].ToString();
                        continue;
                    }

                    // NOTE: a country missing from the countries list adds an extra column here.
                    values[dr[4].ToString().Trim()] = dr[5].ToString();
                }

                // The last group has no following row to trigger its output, so write it
                // here; previously it was silently dropped.
                if (!firstFlag)
                {
                    WriteAggregatedLine(sw, date, pagename, values);
                }
            }
        }

        // Builds a fresh counter set: "troll", "NS" and one entry per country, all "0".
        private static Dictionary <String, String> NewAggregatedCounters(List <String> countries)
        {
            Dictionary <String, String> counters = new Dictionary <String, String>();

            counters.Add("troll", "0");
            counters.Add("NS", "0");
            foreach (String c in countries)
            {
                counters.Add(c, "0");
            }
            return counters;
        }

        // Writes one output line: date, page name, every counter value, then the non-troll total.
        private static void WriteAggregatedLine(StreamWriter sw, DateTime date, String pagename, Dictionary <String, String> values)
        {
            String outta         = date.ToString("yyyy-MM-dd") + "\t" + pagename;
            Int32  totalnontroll = 0;

            foreach (KeyValuePair <String, String> entry in values)
            {
                outta += "\t" + entry.Value;
                if (entry.Key != "troll")
                {
                    totalnontroll += Int32.Parse(entry.Value);
                }
            }
            sw.WriteLine(outta + "\t" + totalnontroll.ToString());
        }
Exemple #5
0
 /// <summary>
 /// Fills <c>Common.interestPages</c> from the result of <c>Common.getPagesSQL</c> and
 /// then starts <c>ParseXML.ParseXMLwikiForEdits</c> on <c>Common.wikixmlFile</c>.
 /// </summary>
 public static void CreateEditsAndEditorsTables()
 {
     // The set of pages must be in place before parsing, because the parser consults it.
     var pagesOfInterest = PostGrePlugIn.getTablePostGre(Common.getPagesSQL);
     Common.interestPages = PostGrePlugIn.DataTableToHashSet(pagesOfInterest);

     ParseXML.ParseXMLwikiForEdits(Common.wikixmlFile);
 }