/// <summary>
/// Builds a set from the first column of a table: each cell is converted to text,
/// trimmed, passed through <c>QuotedPrintable.EncodeQuotedPrintable</c> and added to the set.
/// </summary>
/// <param name="dt">Table whose rows are read; only column 0 of each row is used.</param>
/// <returns>The set of encoded first-column values (duplicates collapse).</returns>
public static HashSet<String> DataTableToHashSet(DataTable dt)
{
    var encodedValues = new HashSet<String>();
    DataRowCollection rows = dt.Rows;

    for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
    {
        String cellText = rows[rowIndex][0].ToString();
        String encoded = QuotedPrintable.EncodeQuotedPrintable(cellText.Trim());
        encodedValues.Add(encoded);
    }

    return encodedValues;
}
/// <summary>
/// Scans <paramref name="source"/> for every key of <c>Common.keys</c> preceded by a colon.
/// While the line is still pending, the first matching key writes one tab-separated line
/// (page, encoded page, key) to <paramref name="sw"/> and sets that key's entry in
/// <c>Common.keysPower</c> to 1; every later matching key has its entry incremented.
/// </summary>
/// <param name="sw">Writer that receives the single output line.</param>
/// <param name="page">Page title written raw and in quoted-printable form.</param>
/// <param name="source">Text that is searched for ":" + key.</param>
/// <param name="pageFlag">When false no line is written and every matching key is only incremented.</param>
public static void PrintCategogy(StreamWriter sw, String page, String source, Boolean pageFlag)
{
    Boolean linePending = pageFlag;

    foreach (String key in Common.keys.Keys)
    {
        Boolean mentioned = source.Contains(":" + key);
        if (!mentioned)
        {
            continue;
        }

        if (linePending)
        {
            String encodedPage = QuotedPrintable.EncodeQuotedPrintable(page);
            sw.WriteLine(page + "\t" + encodedPage + "\t" + key);
            linePending = false;
            Common.keysPower[key] = 1;
        }
        else
        {
            Common.keysPower[key]++;
        }
    }
}
/// <summary>
/// Walks the Russian Wikipedia "all pages" listing for namespace 14, page by page, and collects
/// every listed category whose title contains one of the supplied keys (compared in lower case).
/// </summary>
/// <param name="keys">Dictionary whose keys are the substrings to look for; the values are not read.</param>
/// <returns>
/// One three-element list per match: the key, the category title with the "Категория:" prefix
/// removed, and the literal "14". A category that matches several keys appears once per key.
/// </returns>
static public List<List<String>> ReturnWikiCategory(SortedDictionary<String, Int64> keys)
{
    const String categoryPrefix = "Категория:";
    const String baseString = "https://ru.wikipedia.org";

    List<List<String>> buf = new List<List<String>>();
    var document = new HtmlDocument();
    String tail = "/w/index.php?title=Служебная:Все_страницы&from=%28hed%29+P.E.&namespace=14";
    bool hasNextPage = true;

    // One client for the whole crawl, disposed when the crawl ends.
    using (var client = new WebClient())
    {
        while (hasNextPage)
        {
            // Read the page fully and release the connection before parsing.
            String html;
            using (var stream = client.OpenRead(baseString + tail))
            using (var reader = new StreamReader(stream, Encoding.GetEncoding("UTF-8")))
            {
                html = reader.ReadToEnd();
            }
            document.LoadHtml(html);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//li");
            if (nodes != null)
            {
                foreach (HtmlNode node in nodes)
                {
                    // The first list item that is not a category ends the scan of this page.
                    if (!node.InnerText.Contains(categoryPrefix))
                    {
                        break;
                    }

                    String title = node.InnerText.Substring(categoryPrefix.Length);
                    String value = title.ToLower();
                    foreach (String key in keys.Keys)
                    {
                        if (value.Contains(key.ToLower()))
                        {
                            // "14" is the same value as the namespace parameter of the listing URL.
                            buf.Add(new List<String> { key, title, "14" });
                        }
                    }
                }
            }

            // Assume this is the last page unless a "next page" link is found below.
            hasNextPage = false;
            nodes = document.DocumentNode.SelectNodes("//div[@class='mw-allpages-nav']");
            if (nodes == null)
            {
                continue;
            }

            foreach (HtmlNode node in nodes)
            {
                String val = node.InnerHtml;
                if (node.InnerText.Contains("|"))
                {
                    // Keep only the markup from the first "|" onwards.
                    val = val.Substring(val.IndexOf("|"));
                }

                if (!node.InnerText.Contains("Следующая страница"))
                {
                    continue;
                }

                // The link target is taken as the first double-quoted string in the remaining markup.
                int firstpos = val.IndexOf("\"");
                int lastpos = firstpos < 0 ? -1 : val.Substring(firstpos + 1).IndexOf("\"");
                if (firstpos < 0 || lastpos < 0)
                {
                    // No quoted target here; previously this threw from Substring.
                    continue;
                }

                tail = val.Substring(firstpos + 1, lastpos);
                tail = QuotedPrintable.DecodeQuotedPrintable(tail, "Привет");
                // Turns the HTML-escaped "&amp;" separators back into "&".
                tail = tail.Replace("amp;", "");
                hasNextPage = true;
                break;
            }
        }
    }

    return buf;
}
/// <summary>
/// Streams the XML file at <c>Common.wikixmlFile</c> and, for every revision element, buffers one
/// tab-separated edit row and updates per-editor counters. Buffers are handed to
/// <c>ReleaseFile</c> for <c>Common.editsFile</c> / <c>Common.editorsFile</c> whenever more than
/// 10000 edits are pending, and once more when parsing ends.
/// </summary>
/// <remarks>
/// An <see cref="XmlException"/> is caught and reported on the console; whatever was buffered up
/// to that point is still flushed. Other exceptions propagate to the caller.
/// NOTE(review): <c>PostGrePlugIn.openConnection()</c> is called but no matching close is visible
/// in this method — confirm the connection is released elsewhere.
/// </remarks>
/// <param name="file">
/// Not read by this method; the input path comes from <c>Common.wikixmlFile</c>.
/// NOTE(review): confirm whether callers expect this argument to be honoured.
/// </param>
static public void ParseXMLwikiForEdits(String file)
{
    // XmlReader does not close the underlying reader by default, so both are disposed explicitly.
    using (StreamReader input = new StreamReader(Common.wikixmlFile, Encoding.UTF8))
    using (XmlReader XMLreader = XmlReader.Create(input))
    {
        PostGrePlugIn.openConnection();
        Console.OutputEncoding = Encoding.Unicode;

        // Create/truncate both output files up front (opened with append = false, closed at once).
        new StreamWriter(Common.editsFile, false, Encoding.UTF8).Close();
        new StreamWriter(Common.editorsFile, false, Encoding.UTF8).Close();

        List<String> editsStore = new List<string>();
        Dictionary<String, List<Object>> editorsStore = new Dictionary<string, List<Object>>();

        Boolean flagReadTitle = false;
        Boolean readRevisionMode = false;
        String pageName = "";
        String pageID = "";
        // editorID and comment deliberately live outside the loop, as in the original code:
        // a revision without a contributor element reuses the previous values.
        String editorID = "";
        String comment = "";

        try
        {
            while (XMLreader.Read())
            {
                if (XMLreader.Name == "title" && XMLreader.NodeType == XmlNodeType.Element)
                {
                    flagReadTitle = true;
                    continue;
                }
                if (XMLreader.Name == "title" && XMLreader.NodeType == XmlNodeType.EndElement)
                {
                    flagReadTitle = false;
                    continue;
                }
                if (XMLreader.Name == "page" && XMLreader.NodeType == XmlNodeType.Element)
                {
                    continue;
                }
                if (XMLreader.Name == "page" && XMLreader.NodeType == XmlNodeType.EndElement)
                {
                    // The "page of interest" marker only applies to the page just finished.
                    readRevisionMode = false;
                    continue;
                }
                if (XMLreader.Name == "text" &&
                    (XMLreader.NodeType == XmlNodeType.Element || XMLreader.NodeType == XmlNodeType.EndElement))
                {
                    continue;
                }

                if (XMLreader.NodeType == XmlNodeType.Text && flagReadTitle)
                {
                    pageName = XMLreader.Value;
                    if (Common.interestPages.Contains(QuotedPrintable.EncodeQuotedPrintable(pageName)))
                    {
                        readRevisionMode = true;
                    }

                    // Skip ahead to the next node named "id"; its text is the page id.
                    while (XMLreader.Name != "id")
                    {
                        if (!XMLreader.Read())
                        {
                            throw new XmlException("Unexpected end of document while looking for the page id.");
                        }
                    }
                    XMLreader.Read();
                    pageID = XMLreader.Value;
                    flagReadTitle = false;
                }

                if (XMLreader.Name == "revision" && XMLreader.NodeType == XmlNodeType.Element)
                {
                    String timeString = "";
                    DateTime day = new DateTime();
                    String ip = "NA";
                    String country = "NA";
                    Byte troll = 0;
                    Int32 typetroll = -1;
                    String name = "NA";
                    Int32 nameID = -1;
                    Double distance = -1;
                    Byte inGroup = 0;
                    if (readRevisionMode)
                    {
                        inGroup = 1;
                    }

                    while (!(XMLreader.NodeType == XmlNodeType.EndElement && XMLreader.Name == "revision"))
                    {
                        XMLreader.Read();

                        if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "timestamp")
                        {
                            XMLreader.Read();
                            timeString = XMLreader.Value;
                            day = DateTime.Parse(timeString);
                            XMLreader.Read();
                        }

                        if (XMLreader.NodeType == XmlNodeType.Element && XMLreader.Name == "contributor")
                        {
                            // NOTE(review): a contributor element with neither an ip nor a username
                            // child makes this scan run on into later revisions — confirm the input
                            // never contains one.
                            while (XMLreader.Name != "ip" && XMLreader.Name != "username")
                            {
                                if (!XMLreader.Read())
                                {
                                    throw new XmlException("Unexpected end of document while looking for ip/username.");
                                }
                            }

                            if (XMLreader.Name == "ip")
                            {
                                // Anonymous edit: the IP is the editor key and drives both lookups.
                                XMLreader.Read();
                                ip = editorID = XMLreader.Value;
                                country = PostGrePlugIn.DataTableToList(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("countryipranges", ip, "country")))[0];
                                PostGrePlugIn.GetTrollResults(PostGrePlugIn.getTablePostGre(PostGrePlugIn.ReturnIpQuery("troll_bases", ip, " c1,c2,c3,c4, distance ")), out troll, out distance, out typetroll);
                            }
                            else
                            {
                                // Registered edit: the numeric user id is the editor key.
                                XMLreader.Read();
                                name = XMLreader.Value.Replace(@"\", "_slash_");
                                while (XMLreader.Name != "id")
                                {
                                    if (!XMLreader.Read())
                                    {
                                        throw new XmlException("Unexpected end of document while looking for the user id.");
                                    }
                                }
                                XMLreader.Read();
                                nameID = Int32.Parse(XMLreader.Value);
                                editorID = nameID.ToString();
                            }

                            // Move to the closing contributor tag, then two nodes further, where a
                            // comment element is expected if the revision has one.
                            while (XMLreader.Name != "contributor")
                            {
                                if (!XMLreader.Read())
                                {
                                    throw new XmlException("Unexpected end of document while looking for the end of contributor.");
                                }
                            }
                            XMLreader.Read();
                            XMLreader.Read();

                            if (XMLreader.Name == "comment")
                            {
                                XMLreader.Read();
                                // Newlines, backslashes and tabs would break the tab-separated row.
                                comment = XMLreader.Value.Replace('\n', ' ');
                                comment = comment.Replace(@"\", " ");
                                comment = comment.Replace('\t', ' ');
                            }
                            else
                            {
                                comment = "";
                            }
                            break;
                        }
                    }

                    // Per-editor record: [0] total edits, [1] edits on pages of interest, [2] description.
                    List<Object> editorStats;
                    if (!editorsStore.TryGetValue(editorID, out editorStats))
                    {
                        editorStats = new List<Object>();
                        editorStats.Add(1);
                        editorStats.Add(1 * inGroup);
                        editorStats.Add(ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString());
                        editorsStore.Add(editorID, editorStats);
                    }
                    else
                    {
                        editorStats[0] = Convert.ToInt32(editorStats[0]) + 1;
                        // Fixed: this used to be recomputed from slot [0] ("[0] * inGroup + 1"),
                        // which overwrote the in-group count instead of adding this edit to it.
                        editorStats[1] = Convert.ToInt32(editorStats[1]) + inGroup;
                    }

                    editsStore.Add(pageName + "\t" + pageID + "\t" + inGroup.ToString() + "\t" + CleanExactTime(timeString) + "\t" + day.ToString("yyyy-MM-dd") + "\t" + ip + "\t" + name + "\t" + nameID + "\t" + country + "\t" + troll.ToString() + "\t" + typetroll.ToString() + "\t" + distance.ToString() + "\t" + comment);

                    // Both buffers are released together, driven by the number of pending edits.
                    if (editsStore.Count > 10000)
                    {
                        editorsStore = ReleaseFile(Common.editorsFile, editorsStore);
                        editsStore = ReleaseFile(Common.editsFile, editsStore);
                    }
                }
            }
        }
        catch (XmlException ex)
        {
            Console.WriteLine("Ended abnormally..." + ex);
        }

        // Final flush: without it the last (up to 10000) buffered edits and their editors were
        // dropped when the method returned. Presumably ReleaseFile appends, as it is already
        // called repeatedly on the same files above — confirm.
        if (editorsStore.Count > 0)
        {
            ReleaseFile(Common.editorsFile, editorsStore);
        }
        if (editsStore.Count > 0)
        {
            ReleaseFile(Common.editsFile, editsStore);
        }
    }
}