/// <summary>
/// Converts one Wikipedia dump page into a <see cref="DocObject"/>.
/// </summary>
/// <remarks>
/// The page's <c>revision</c> child supplies the <c>timestamp</c> and <c>text</c>.
/// When the revision, the timestamp or the text element is missing, the date falls
/// back to <see cref="DateTime.UtcNow"/> and the text to an empty string.
/// </remarks>
/// <param name="WikipediaEntry">
/// A <see cref="WikipediaEntryObject"/> passed as <c>object</c>; its <c>el</c>,
/// <c>title</c> and <c>url</c> members are read.
/// </param>
/// <returns>
/// The populated document, or <c>null</c> if any exception was thrown
/// (the exception message is written to <see cref="Trace"/>).
/// </returns>
public DocObject CrawlWikipediaEntry(object WikipediaEntry)
{
    try
    {
        WikipediaEntryObject we = (WikipediaEntryObject)WikipediaEntry;

        // Build element names as XNamespace + local name. The string form
        // "{" + ns + "}name" degenerates to "{}name" for a page without a
        // namespace, which XName rejects with an ArgumentException.
        XNamespace ns = we.el.Name.Namespace;

        // Defaults used whenever the corresponding element is absent.
        DateTime date = DateTime.UtcNow;
        string text = "";

        XElement revision = we.el.Element(ns + "revision");
        if (revision != null)
        {
            XElement timestamp = revision.Element(ns + "timestamp");
            if (timestamp != null)
            {
                date = TimeZoneToUtc(timestamp.Value, "en");
            }

            XElement body = revision.Element(ns + "text");
            if (body != null)
            {
                text = body.Value;
            }
        }

        DocObject doc = new DocObject();

        // StripWikiTags receives the raw wiki text and fills the document by reference.
        StripWikiTags(text, ref doc, we.title);

        doc.url = we.url;
        // title rewriting
        doc.title = we.title + " - Wikipedia";
        doc.docDate = date;
        doc.domain = Url2domain(doc.url);

        return doc;
    }
    catch (Exception e9)
    {
        // Best effort: a page that cannot be converted is reported and dropped by the caller.
        Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "CrawlWikipediaEntry: " + e9.Message);
        return null;
    }
}
/// <summary>
/// Streams a Wikipedia XML dump and exports every regular article to an output file.
/// </summary>
/// <remarks>
/// Pages whose title contains ':' and pages carrying a <c>redirect</c> element are ignored.
/// With format <c>"text"</c> each document is written as five newline-terminated fields
/// (url, domain, date, title, text); with any other format the documents are written as
/// one JSON array. The method reads the fields <c>wikipediaCount</c>, <c>wikipediaPosition</c>
/// and <c>wikipediaTitle</c> to resume an earlier run, and it modifies <c>wikipediaPosition</c>
/// when it rewinds. Per-page failures are traced and skipped.
/// </remarks>
/// <param name="param">
/// A boxed value tuple <c>(string inputPath, string outputPath, string format, string urlPrefix)</c>.
/// </param>
public void ReadWikipedia(object param)
{
    var parameter = ((string inputPath, string outputPath, string format, string urlPrefix))param;

    if (!File.Exists(parameter.inputPath))
    {
        Console.WriteLine("Wikipedia dump not found: " + parameter.inputPath);
        return;
    }

    Console.WriteLine("Wikipedia export started: " + parameter.inputPath + " -> " + parameter.outputPath + " ...");

    long count = 0;
    string title = "";
    bool skip = false;
    bool isText = (parameter.format == "text");

    // True once at least one JSON document has been emitted. The separator must depend on
    // what was actually written, not on 'count': 'count' also advances for pages whose
    // conversion failed and starts at wikipediaCount on resume, which used to yield "[,{...".
    bool jsonDocWritten = false;

    using (FileStream outputFileStream = File.Create(parameter.outputPath))
    {
        if (!isText)
        {
            outputFileStream.Write(openingBracketByte);
        }

        // The dump is only read, so request read access; the default (ReadWrite)
        // fails on read-only files.
        using (FileStream inputFileStream = new FileStream(parameter.inputPath, FileMode.Open, FileAccess.Read))
        using (var reader = XmlReader.Create(inputFileStream))
        {
            // continue a previous run
            if (wikipediaCount > count)
            {
                count = wikipediaCount;

                // start before the stored position, and skip until the last title is reached
                if (wikipediaPosition > 10000000)
                {
                    wikipediaPosition -= 10000000;
                    skip = true;
                }
                else if (wikipediaPosition > 1000000)
                {
                    wikipediaPosition -= 1000000;
                    skip = true;
                }

                // NOTE(review): this repositions the stream underneath an already created
                // XmlReader, so parsing starts mid-document - confirm this is intended.
                inputFileStream.Seek(wikipediaPosition, SeekOrigin.Begin);
            }

            while (reader.Read())
            {
                try
                {
                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
                    {
                        continue;
                    }

                    // NOTE(review): ReadFrom appears to leave the reader on the node after
                    // </page>, which the loop's Read() then steps over; harmless while
                    // whitespace separates pages - confirm for compact dumps.
                    WikipediaEntryObject entry = new WikipediaEntryObject { el = XNode.ReadFrom(reader) as XElement };
                    if (entry.el == null)
                    {
                        continue;
                    }

                    // XNamespace + local name also works for an empty namespace,
                    // unlike the string form "{}name".
                    XNamespace ns = entry.el.Name.Namespace;

                    string redirectTitle = "";
                    title = entry.el.Element(ns + "title").Value;

                    try
                    {
                        XElement redirectElement = entry.el.Element(ns + "redirect");
                        if (redirectElement != null)
                        {
                            redirectTitle = redirectElement.FirstAttribute.Value;
                        }
                    }
                    catch (Exception e)
                    {
                        Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "title exception: " + e.Message);
                    }

                    // no internal wiki pages, no forwarding
                    if (title.Contains(":") || !String.IsNullOrEmpty(redirectTitle))
                    {
                        continue;
                    }

                    if (skip)
                    {
                        // The page matching wikipediaTitle is itself skipped; export resumes after it.
                        if (title == wikipediaTitle)
                        {
                            skip = false;
                        }
                        continue;
                    }

                    entry.url = "http://" + parameter.urlPrefix + ".wikipedia.org/wiki/" + Uri.EscapeUriString(title.Replace(" ", "_"));
                    entry.urlPrefix = parameter.urlPrefix;
                    entry.title = title;
                    count++;

                    DocObject doc = CrawlWikipediaEntry(entry);
                    if (doc == null)
                    {
                        continue;
                    }

                    if (isText)
                    {
                        byte[] info = Encoding.UTF8.GetBytes(
                            doc.url + Environment.NewLine +
                            doc.domain + Environment.NewLine +
                            DateToJsonDouble(doc.docDate).ToString() + Environment.NewLine +
                            doc.title + Environment.NewLine +
                            doc.text.Replace("\r", " ") + Environment.NewLine
                        );
                        outputFileStream.Write(info, 0, info.Length);
                    }
                    else
                    {
                        Wikipedia.DocJson docJson = new Wikipedia.DocJson
                        {
                            url = doc.url,
                            domain = doc.domain,
                            title = doc.title,
                            content = doc.text,
                            docDate = DateToJsonDouble(doc.docDate)
                        };

                        // Serialize first so a serialization failure cannot leave a dangling comma.
                        byte[] json = JsonSerializer.SerializeToUtf8Bytes(docJson, jsonSerializerOptions);
                        if (jsonDocWritten)
                        {
                            outputFileStream.Write(commaByte);
                        }
                        outputFileStream.Write(json);
                        jsonDocWritten = true;
                    }

                    if ((count % 100000) == 0)
                    {
                        Console.WriteLine("docs: " + count.ToString("N0"));
                    }
                }
                catch (Exception e)
                {
                    // Best effort: a broken page is reported and the export continues.
                    Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "wikipedia exception: " + e.Message);
                }
            }
        }

        if (!isText)
        {
            outputFileStream.Write(closingBracketByte);
        }
    }

    Console.WriteLine("Wikipedia export finished: docs: " + count.ToString("N0"));
}