示例#1
0
        /// <summary>
        /// Loads data from the Marvel Chronology Project (MCP) site
        /// and saves it into the specified directory. Note that KEY is
        /// not extracted, you have to get it manually.
        /// </summary>
        /// <remarks>
        /// For every URL returned by <c>ReadInputPages</c> the page is downloaded and
        /// every <c>&lt;p&gt;</c> inside the single <c>div</c> with ID "chrons" that has
        /// exactly two direct <c>&lt;span&gt;</c> children is treated as one character.
        /// Results for a page are written to a sub-directory of
        /// <paramref name="pathResultsDir"/> named after the 5th character from the end
        /// of the URL, into three files (paths supplied by <c>FileManager</c>):
        /// names ("id,name"), comics ("id,comic,comic,...") and links
        /// ("id,link,link,...", only for characters that have at least one link).
        /// Existing files are overwritten. If a page contains no element, or more than
        /// one element, with ID "chrons", an error is printed and the whole extraction stops.
        /// </remarks>
        /// <param name="pathResultsDir">
        /// Path to the directory where the results will be saved.
        /// </param>
        public static void LoadGraphFromInternet(string pathResultsDir)
        {
            Console.WriteLine("Starting to extract data from Marvel Chronology Project . . .");

            // IDs handed out so far; shared across all pages so that generated IDs stay unique.
            HashSet<string> setCharactersID = new HashSet<string>();

            List<string> inputPages = ReadInputPages();

            // One loader is enough for all pages.
            HtmlWeb htmlWeb = new HtmlWeb();

            int i = 1;

            foreach (string page in inputPages)
            {
                // Load html page
                Console.WriteLine("Loading page " + i + " of " + inputPages.Count + " . . .");
                HtmlDocument htmlPage = htmlWeb.Load(page);

                // Start data extraction
                Console.WriteLine("Extracting data from " + page + " . . .");

                // SelectNodes yields null (not an empty collection) when nothing matches.
                var tagTitle = htmlPage.DocumentNode.SelectNodes("html/head/title");
                if (tagTitle != null)
                {
                    foreach (HtmlNode node in tagTitle)
                    {
                        Console.WriteLine("Page title: " + node.InnerText);
                    }
                }

                List<HtmlNode> chrons = htmlPage.DocumentNode.Descendants("div").Where(x => x.Id == "chrons").ToList();

                if (chrons.Count == 0)
                {
                    Console.WriteLine("ERROR! No element with ID = \"chrons\" has been found.");
                    return;
                }

                if (chrons.Count > 1)
                {
                    Console.WriteLine("ERROR! More than 1 element with ID = \"chrons\" has been found.");
                    return;
                }

                // Materialized with ToList() so that every parsing problem surfaces BEFORE the
                // output files are opened (and truncated) below.
                var charactersInfo = (from pTag in chrons[0].Descendants("p")
                                      where pTag.ChildNodes.Count(x => x.Name == "span") == 2
                                      select new
                                      {
                                          // Text of the first <b> inside the first span with class "char".
                                          Name = (from spanChar in pTag.Descendants("span")
                                                  where spanChar.Attributes["class"] != null
                                                        && spanChar.Attributes["class"].Value == "char"
                                                  select spanChar.Descendants("b"))
                                                 .ElementAt(0).ElementAt(0)
                                                 .InnerText
                                                 .Trim().ToLower()
                                                 .Replace(',', ' ').Replace('\n', ' ').Replace("  ", " "),

                                          Id = pTag.Id.Trim().ToLower(),

                                          Links = (from tagA in pTag.Descendants("a")
                                                   where tagA.Attributes["href"] != null
                                                   select tagA.Attributes["href"].Value.Trim().ToLower())
                                                  .ToList(),

                                          // Text of the first span with class "chron", one comic per line.
                                          Comics = (from spanChron in pTag.Descendants("span")
                                                    where spanChron.Attributes["class"] != null
                                                          && spanChron.Attributes["class"].Value == "chron"
                                                    select spanChron.InnerText)
                                                   .ElementAt(0)
                                                   .Trim().ToLower()
                                                   .Replace(',', ' ').Replace("  ", " ")
                                                   .Split('\n')
                                                   .ToList()
                                      }).ToList();

                Console.WriteLine("Saving data . . .");

                // Example: MAIN_DIR\a. Needed letter is the 5th character from the end because
                // every URL ends with ?.php. UGLY
                string currentLetter = page[page.Length - 5].ToString();
                string pathTargetDir = Path.Combine(pathResultsDir, currentLetter);

                // CreateDirectory does nothing when the directory already exists.
                Directory.CreateDirectory(pathTargetDir);

                using (StreamWriter writerChNames = new StreamWriter(FileManager.GetNamesFilePath(pathTargetDir), false))
                using (StreamWriter writerChComics = new StreamWriter(FileManager.GetComicsFilePath(pathTargetDir), false))
                using (StreamWriter writerChLinks = new StreamWriter(FileManager.GetLinksFilePath(pathTargetDir), false))
                {
                    foreach (var character in charactersInfo)
                    {
                        string uniqueCharacterId;

                        if (character.Id.Equals(String.Empty))
                        {
                            // No ID on the page: fall back to a freshly generated GUID.
                            uniqueCharacterId = Guid.NewGuid().ToString();
                            Console.WriteLine(String.Format("WARNING! Empty ID was detected. Unique GUID was generated instead. GUID: {0}. Character's Name: {1}.",
                                                            uniqueCharacterId, character.Name));
                        }
                        else
                        {
                            string baseId = IDHelper.AddIDParentLetter(IDHelper.NormalizeID(character.Id), currentLetter);
                            uniqueCharacterId = baseId;

                            // On collision append "(1)", "(2)", ... until the ID is unused.
                            int index = 1;
                            while (setCharactersID.Contains(uniqueCharacterId))
                            {
                                uniqueCharacterId = baseId + String.Format("({0})", index);
                                ++index;
                            }

                            setCharactersID.Add(uniqueCharacterId);
                        }

                        // Write names
                        writerChNames.WriteLine(String.Format("{0},{1}", uniqueCharacterId, character.Name));

                        // Write comics
                        writerChComics.Write(uniqueCharacterId);
                        foreach (string comic in character.Comics)
                        {
                            writerChComics.Write("," + comic);
                        }
                        writerChComics.WriteLine();

                        // Write links (characters without links get no line at all)
                        if (character.Links.Count > 0)
                        {
                            writerChLinks.Write(uniqueCharacterId);
                            foreach (string link in character.Links)
                            {
                                writerChLinks.Write("," + link);
                            }
                            writerChLinks.WriteLine();
                        }
                    }
                }

                Console.WriteLine("Finished processing " + page);

                ++i;
            }

            Console.WriteLine("Data extraction finished.");
        }