Exemplo n.º 1
0
            /// <summary>
            /// Builds a <see cref="DocObject"/> from a single page of a Wikipedia XML dump.
            /// </summary>
            /// <remarks>
            /// Reads the <c>revision/timestamp</c> and <c>revision/text</c> children of the page
            /// element, passes the text through <c>StripWikiTags</c>, and then fills in url, title,
            /// date and domain. If the revision, timestamp or text element is missing, the date
            /// falls back to the current UTC time and the text to an empty string.
            /// </remarks>
            /// <param name="WikipediaEntry">
            /// The page to convert; it is cast to <c>WikipediaEntryObject</c>, and its
            /// <c>el</c>, <c>title</c> and <c>url</c> members are read.
            /// </param>
            /// <returns>
            /// The populated document, or <c>null</c> if any exception was thrown while processing
            /// (the exception message is written to <see cref="Trace"/>).
            /// </returns>
            public DocObject CrawlWikipediaEntry(object WikipediaEntry)
            {
                try
                {
                    WikipediaEntryObject we = (WikipediaEntryObject)WikipediaEntry;

                    // Children are looked up in the same XML namespace as the page element.
                    // Combining XNamespace with a local name also works when that namespace is
                    // empty, whereas the expanded string form "{}name" is rejected by XName.
                    XNamespace ns = we.el.Name.Namespace;

                    XElement revision  = we.el.Element(ns + "revision");
                    XElement timestamp = revision?.Element(ns + "timestamp");
                    XElement body      = revision?.Element(ns + "text");

                    DateTime date;
                    if (timestamp != null)
                    {
                        date = TimeZoneToUtc(timestamp.Value, "en");
                    }
                    else
                    {
                        // no revision or no timestamp: use the time of processing
                        date = DateTime.UtcNow;
                    }

                    string text = (body != null) ? body.Value : "";

                    DocObject doc = new DocObject();

                    // StripWikiTags receives the document by ref and the raw wiki text
                    // (internal links, external links, image links are handled there).
                    StripWikiTags(text, ref doc, we.title);

                    doc.url = we.url;
                    //title rewriting
                    doc.title   = we.title + " - Wikipedia";
                    doc.docDate = date;
                    doc.domain  = Url2domain(doc.url);

                    return doc;
                }
                catch (Exception e9)
                {
                    // best effort: a page that cannot be processed is logged and reported as null
                    Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "CrawlWikipediaEntry: " + e9.Message);
                    return null;
                }
            }
Exemplo n.º 2
0
            /// <summary>
            /// Streams a Wikipedia XML dump and exports its article pages to a text or JSON file.
            /// </summary>
            /// <remarks>
            /// Every <c>page</c> element is materialized on its own, converted with
            /// <c>CrawlWikipediaEntry</c> and written to the output file. Pages whose title contains
            /// ':' and pages that carry a <c>redirect</c> element are skipped. Exceptions raised while
            /// handling a node are written to <see cref="Trace"/> and processing continues with the
            /// next node. Progress is printed to the console every 100,000 documents.
            /// </remarks>
            /// <param name="param">
            /// A boxed tuple <c>(string inputPath, string outputPath, string format, string urlPrefix)</c>.
            /// A <c>format</c> of <c>"text"</c> writes url, domain, date, title and text on separate
            /// lines; any other value writes the documents as JSON between
            /// <c>openingBracketByte</c> and <c>closingBracketByte</c>, separated by <c>commaByte</c>.
            /// </param>
            public void ReadWikipedia(object param)
            {
                var parameter = ((string inputPath, string outputPath, string format, string urlPrefix))param;

                if (!File.Exists(parameter.inputPath))
                {
                    Console.WriteLine("Wikipedia dump not found: " + parameter.inputPath);
                    return;
                }

                Console.WriteLine("Wikipedia export started: " + parameter.inputPath + " -> " + parameter.outputPath + " ...");

                long count       = 0;
                bool skip        = false;
                bool isText      = (parameter.format == "text");
                bool jsonStarted = false; // true once the first JSON document has been written

                using (FileStream outputFileStream = File.Create(parameter.outputPath))
                {
                    if (!isText)
                    {
                        outputFileStream.Write(openingBracketByte);
                    }

                    // The dump is only read, so request read access only; this also allows
                    // read-only dump files to be processed.
                    using (FileStream inputFileStream = new FileStream(parameter.inputPath, FileMode.Open, FileAccess.Read, FileShare.Read))
                    using (XmlReader reader = XmlReader.Create(inputFileStream))
                    {
                        //continue a previous run
                        if (wikipediaCount > count)
                        {
                            count = wikipediaCount;

                            //start before the stored position, and skip until the last title is reached
                            if (wikipediaPosition > 10000000)
                            {
                                wikipediaPosition -= 10000000;
                                skip = true;
                            }
                            else if (wikipediaPosition > 1000000)
                            {
                                wikipediaPosition -= 1000000;
                                skip = true;
                            }

                            // NOTE(review): the stream is repositioned after the XmlReader was created
                            // on it; whether the reader can resume parsing from an arbitrary byte
                            // offset is not evident from this method - confirm.
                            inputFileStream.Seek(wikipediaPosition, SeekOrigin.Begin);
                        }

                        // NOTE(review): XNode.ReadFrom leaves the reader on the node following the
                        // page element, so the Read() below steps over that node. This presumably
                        // relies on pages being separated by whitespace in the dump - confirm.
                        while (reader.Read())
                        {
                            try
                            {
                                if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
                                {
                                    continue;
                                }

                                WikipediaEntryObject entry = new WikipediaEntryObject
                                {
                                    el = XNode.ReadFrom(reader) as XElement
                                };

                                if (entry.el == null)
                                {
                                    continue;
                                }

                                // XNamespace + local name also works when the page has no namespace,
                                // whereas the expanded string form "{}name" is rejected by XName.
                                XNamespace ns = entry.el.Name.Namespace;

                                // A page without a title element throws here; the outer catch logs it.
                                string title         = entry.el.Element(ns + "title").Value;
                                string redirectTitle = "";
                                try
                                {
                                    XElement redirectElement = entry.el.Element(ns + "redirect");
                                    if (redirectElement != null)
                                    {
                                        redirectTitle = redirectElement.FirstAttribute.Value;
                                    }
                                }
                                catch (Exception titleException)
                                {
                                    Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "title exception: " + titleException.Message);
                                }

                                //no internal wiki pages, no forwarding
                                if (title.Contains(":") || !String.IsNullOrEmpty(redirectTitle))
                                {
                                    continue;
                                }

                                if (skip)
                                {
                                    // the page matching the stored title is itself still skipped
                                    if (title == wikipediaTitle)
                                    {
                                        skip = false;
                                    }
                                    continue;
                                }

                                entry.url       = "http://" + parameter.urlPrefix + ".wikipedia.org/wiki/" + Uri.EscapeUriString(title.Replace(" ", "_"));
                                entry.urlPrefix = parameter.urlPrefix;
                                entry.title     = title;

                                // counts every candidate page, including those the crawl rejects below
                                count++;

                                DocObject doc = CrawlWikipediaEntry(entry);
                                if (doc == null)
                                {
                                    continue;
                                }

                                if (isText)
                                {
                                    byte[] info = Encoding.UTF8.GetBytes(
                                        doc.url + Environment.NewLine +
                                        doc.domain + Environment.NewLine +
                                        DateToJsonDouble(doc.docDate).ToString() + Environment.NewLine +
                                        doc.title + Environment.NewLine +
                                        doc.text.Replace("\r", " ") + Environment.NewLine
                                        );
                                    outputFileStream.Write(info, 0, info.Length);
                                }
                                else
                                {
                                    Wikipedia.DocJson docJson = new Wikipedia.DocJson
                                    {
                                        url     = doc.url,
                                        domain  = doc.domain,
                                        title   = doc.title,
                                        content = doc.text,
                                        docDate = DateToJsonDouble(doc.docDate)
                                    };

                                    // The separator depends on what was actually written to this file,
                                    // not on count: count may start above zero on a resumed run and is
                                    // incremented even when the crawl returns null.
                                    if (jsonStarted)
                                    {
                                        outputFileStream.Write(commaByte);
                                    }
                                    outputFileStream.Write(JsonSerializer.SerializeToUtf8Bytes(docJson, jsonSerializerOptions));
                                    jsonStarted = true;
                                }

                                if ((count % 100000) == 0)
                                {
                                    Console.WriteLine("docs: " + count.ToString("N0"));
                                }
                            }
                            catch (Exception e)
                            {
                                Trace.WriteLine(DateTime.UtcNow.ToString() + " " + "wikipedia exception: " + e.Message);
                            }
                        }
                    }

                    if (!isText)
                    {
                        outputFileStream.Write(closingBracketByte);
                    }
                }//end of output file
                Console.WriteLine("Wikipedia export finished:   docs: " + count.ToString("N0"));
            }