예제 #1
0
        /// <summary>
        /// Queries the English Wikipedia "opensearch" endpoint for a single match and
        /// returns its cleaned-up description and link.
        /// </summary>
        /// <param name="searchQuery">
        /// Raw (not yet URL-encoded) search text. It is percent-encoded here before
        /// being placed in the query string; <c>null</c> is treated as an empty query.
        /// </param>
        /// <returns>
        /// A <see cref="WikipediaData"/> whose <c>Description</c> and <c>Link</c> are filled
        /// from the response, or both set to the empty string when the request, the
        /// JSON parsing or the clean-up fails for any reason.
        /// </returns>
        public static async Task <WikipediaData> GetWiki(string searchQuery)
        {
            var wiki = new WikipediaData();

            using (var client = new HttpClient())
            {
                try
                {
                    // Encode the query so characters such as '&', '#', '+' or '=' cannot
                    // terminate or alter the "search" parameter of the request URL.
                    string encodedQuery = System.Uri.EscapeDataString(searchQuery ?? "");

                    var stringJson =
                        await client.GetStringAsync(
                            $@"https://en.wikipedia.org/w/api.php?action=opensearch&search={encodedQuery}&limit=1&namespace=0&format=json");

                    JToken       token           = JToken.Parse(stringJson);
                    const string quote           = "\"";
                    // Removes "(...)" groups, CR/LF pairs, double quotes and backslashes.
                    string       patternDesc     = $@"\([^()]*\)|\r\n|{quote}|\\";
                    // Removes square brackets, apostrophes, CR/LF pairs, double quotes and backslashes.
                    string       patternBrackets = $@"[\[\]']|\r\n|{quote}|\\";

                    // patternDesc only matches parentheses without nested ones, so it is
                    // applied twice: the second pass removes a group that enclosed a
                    // group stripped by the first pass.
                    string description = HtmlEntity.DeEntitize(token[2].ToString());
                    description = Regex.Replace(description, patternDesc, "");
                    description = Regex.Replace(description, patternDesc, "").Trim();
                    wiki.Description = Regex.Replace(description, patternBrackets, "");

                    wiki.Link = Regex.Replace(token[3].ToString(), patternBrackets, "").Trim();
                }
                catch
                {
                    // Deliberate best effort: any failure (network, malformed JSON,
                    // missing array element) yields an empty result instead of throwing.
                    wiki.Description = "";
                    wiki.Link        = "";
                }
            }

            return wiki;
        }
예제 #2
0
		/// <summary>
		/// Turns one infobox header/value pair into plain-English statements about
		/// <paramref name="subject"/> and appends each of them to <c>wiki_data.content</c>,
		/// advancing <c>wiki_data.content_entries</c> once per statement.
		/// </summary>
		/// <remarks>
		/// The checks are independent and run in a fixed order, so a header that matches
		/// several of the substring rules (for example one containing both "awards" and
		/// "Related") produces one statement per matching rule. Headers that match no
		/// rule add nothing.
		/// </remarks>
		/// <param name="wiki_data">Receives the generated statements.</param>
		/// <param name="subject">Name used as the subject of every statement.</param>
		/// <param name="header">Infobox header text selecting which rule(s) apply.</param>
		/// <param name="data">Infobox value text; some rules split it on ',', '-' or ' '.</param>
		/// <param name="probably_person">Currently unused by this method.</param>
		private void AddHeaderQuestion(WikipediaData wiki_data, string subject, string header, string data, bool probably_person)
		{
			string lowerHeader = header.ToLower();

			if ((header == "Carbohydrates") || (header == "Fat") || (header == "Protein"))
			{
				// "Carbohydrates" is reported in the singular; the others are just lower-cased.
				string nutrient = (header == "Carbohydrates") ? "carbohydrate" : lowerHeader;
				AppendHeaderStatement(wiki_data, subject + " has a " + nutrient + " content of " + data);
			}
			if (header == "Birth name")
			{
				AppendHeaderStatement(wiki_data, subject + " was born named " + data);
			}
			if (header == "Conservation status")
			{
				AppendHeaderStatement(wiki_data, subject + " has the conservation status " + data);
			}
			if (header == "Location")
			{
				AppendHeaderStatement(wiki_data, subject + " is located in " + data);
			}
			if ((header == "Date") && ContainsMonth(data))
			{
				if (data.Contains("-"))
				{
					AppendHeaderStatement(wiki_data, FormatHeaderRange(subject, " occurred between ", data));
				}
				else
				{
					AppendHeaderStatement(wiki_data, subject + " occurred on " + data);
				}
			}
			if (header == "Binomial name")
			{
				// Only the text before the first comma is used.
				AppendHeaderStatement(wiki_data, subject + " has the latin name " + data.Split(',')[0]);
			}
			if (header == "Subspecies")
			{
				string firstSubspecies = data.Split(',')[0];
				// Values such as "See text" are cross references, not subspecies names.
				if (!firstSubspecies.ToLower().StartsWith("see "))
				{
					AppendHeaderStatement(wiki_data, subject + " belongs to the subspecies " + firstSubspecies);
				}
			}
			if ((header == "Birthplace") || (header == "Birth place"))
			{
				AppendHeaderStatement(wiki_data, subject + " was born in " + data);
			}
			if (header == "Industry")
			{
				AppendHeaderStatement(wiki_data, subject + " belongs to the industry " + data);
			}
			if (header == "Time zone")
			{
				AppendHeaderStatement(wiki_data, subject + " is in the time zone " + data);
			}
			if (header == "Currency")
			{
				AppendHeaderStatement(wiki_data, subject + " uses the currency " + data);
			}
			if ((lowerHeader == "ethnic group") || (lowerHeader == "ethnicity"))
			{
				AppendHeaderStatement(wiki_data, subject + " belongs to the ethnic group " + data);
			}
			if ((header == "Preceded by") || (header == "Precededby"))
			{
				AppendHeaderStatement(wiki_data, subject + " was preceded by " + data);
			}
			if (header.Contains("Previous mission"))
			{
				AppendHeaderStatement(wiki_data, subject + " was preceded by " + data);
			}
			if (header == "Memory")
			{
				AppendHeaderStatement(wiki_data, subject + " has a memory of " + data);
			}
			if ((header == "Reign") && data.Contains("-"))
			{
				AppendHeaderStatement(wiki_data, FormatHeaderRange(subject, " reigned between ", data));
			}
			if (header == "Coronation")
			{
				AppendHeaderStatement(wiki_data, subject + " was crowned on " + data);
			}
			if (header == "Religious stance")
			{
				AppendHeaderStatement(wiki_data, subject + " has the religious stance " + data);
			}
			if (lowerHeader.Contains("awards"))
			{
				AppendHeaderStatement(wiki_data, subject + " won awards " + data);
			}
			if (lowerHeader.Contains("advisor"))
			{
				AppendHeaderStatement(wiki_data, subject + " was advised by " + data);
			}
			if ((header == "Followed by") ||
			    (header == "Followedby") ||
			    (header == "Succeededby") ||
			    (header == "Succeeded by"))
			{
				AppendHeaderStatement(wiki_data, subject + " was followed by " + data);
			}
			if (header.Contains("Next mission"))
			{
				// Subject and value are swapped here, exactly as in the original rule.
				AppendHeaderStatement(wiki_data, data + " was followed by " + subject);
			}
			if (header.Contains("Launched"))
			{
				AppendHeaderStatement(wiki_data, subject + " was launched on " + data);
			}
			if (header.Contains("Orbital period"))
			{
				AppendHeaderStatement(wiki_data, subject + " has an orbital period of " + data);
			}
			if (header.Contains("Eccentricity"))
			{
				AppendHeaderStatement(wiki_data, subject + " has an orbital eccentricity of " + data);
			}
			if (header.Contains("Discovery date"))
			{
				AppendHeaderStatement(wiki_data, FormatYearOrDate(subject, " was discovered ", data));
			}
			if (header.Contains("Publication date"))
			{
				AppendHeaderStatement(wiki_data, FormatYearOrDate(subject, " was published ", data));
			}
			if (header.Contains("Publisher"))
			{
				AppendHeaderStatement(wiki_data, subject + " was published by " + data);
			}
			if (header.Contains("Media type"))
			{
				AppendHeaderStatement(wiki_data, subject + " has the media type " + data);
			}
			if (header.Contains("Author"))
			{
				AppendHeaderStatement(wiki_data, subject + " was written by " + data);
			}
			if (header.Contains("Patron saint"))
			{
				AppendHeaderStatement(wiki_data, subject + " has the patron saint " + data);
			}
			if (header.Contains("Discovered by"))
			{
				AppendHeaderStatement(wiki_data, subject + " was discovered by " + data);
			}
			if (header == "Death")
			{
				AppendLifeEventStatement(wiki_data, subject, " died ", data);
			}
			if (header == "Birth")
			{
				AppendLifeEventStatement(wiki_data, subject, " born ", data);
			}
			if (header == "Also known as")
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " is also known as ", "");
			}
			if (header == "School/tradition")
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " follows the school of thought ", "");
			}
			if (header == "Main interests")
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " is interested in ", "");
			}
			if (header == "Notable ideas")
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " has ideas in ", "");
			}
			if (header == "Influenced")
			{
				foreach (string rawName in data.Split(','))
				{
					string influenced = rawName.Trim();
					// Skip self references such as "X influenced X".
					if (influenced != subject)
					{
						AppendHeaderStatement(wiki_data, subject + " influenced " + influenced);
					}
				}
			}
			if (header == "Government")
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " has the government type ", "");
			}
			// The original tested "Politicalparty" twice; the spaced spelling is the
			// variant the duplicated test was evidently meant to cover.
			if ((header == "Politicalparty") || (header == "Political party"))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " was a member of the ", " party");
			}
			// "Alamater" is kept for backward compatibility; the correctly spelled
			// variants are accepted as well.
			if ((header == "Alamater") || (header == "Almamater") || (header == "Alma mater"))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " attended ", "");
			}
			if ((lowerHeader == "vicepresident") || (lowerHeader == "vice president"))
			{
				foreach (string rawName in data.Split(','))
				{
					string vicePresident = rawName.Trim();
					// Ignore very short fragments and the literal placeholder "none".
					if ((vicePresident.Length > 3) && (vicePresident.ToLower() != "none"))
					{
						AppendHeaderStatement(wiki_data, subject + " had the vice president " + vicePresident);
					}
				}
			}
			if ((header == "Official languages") ||
			    (header == "Official language(s)") ||
			    (header == "Official language"))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " has the official language ", "");
			}
			if (header == "Demonym")
			{
				foreach (string rawName in data.Split(','))
				{
					string demonym = rawName.Trim();
					AppendHeaderStatement(wiki_data, demonym + "s live in " + subject);
					AppendHeaderStatement(wiki_data, demonym + " is a type of nationality");
				}
			}
			if ((header == "Occupation(s)") || (header == "Occupation"))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " has the occupation ", "");
			}
			if ((header == "Genre(s)") || (header == "Genre"))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " has the genre ", "");
			}
			if ((header == "Instrument(s)") || (header == "Instrument"))
			{
				foreach (string rawName in data.Split(','))
				{
					string instrument = rawName.Trim();
					if (instrument.ToLower() != "vocals")
					{
						AppendHeaderStatement(wiki_data, subject + " plays the instrument " + instrument);
					}
					else
					{
						AppendHeaderStatement(wiki_data, subject + " is a singer");
					}
				}
			}
			if ((header.Contains("Associated act")) || (header.Contains("Related")))
			{
				AppendHeaderStatementPerItem(wiki_data, data, subject + " is associated with ", "");
			}
		}

		/// <summary>Stores a statement in the next slot of wiki_data.content and advances content_entries.</summary>
		private static void AppendHeaderStatement(WikipediaData wiki_data, string statement)
		{
			wiki_data.content[wiki_data.content_entries++] = statement;
		}

		/// <summary>Splits data on commas and stores prefix + trimmed item + suffix for every item.</summary>
		private static void AppendHeaderStatementPerItem(WikipediaData wiki_data, string data, string prefix, string suffix)
		{
			foreach (string item in data.Split(','))
			{
				AppendHeaderStatement(wiki_data, prefix + item.Trim() + suffix);
			}
		}

		/// <summary>Builds "subject + verb + start and end" from a value containing '-' (only the first two parts are used).</summary>
		private static string FormatHeaderRange(string subject, string verb, string data)
		{
			string[] bounds = data.Split('-');
			return subject + verb + bounds[0].Trim() + " and " + bounds[1].Trim();
		}

		/// <summary>Uses "in" for a single-word value (no spaces) and "on" for anything longer.</summary>
		private static string FormatYearOrDate(string subject, string verb, string data)
		{
			string preposition = (data.Split(' ').Length == 1) ? "in " : "on ";
			return subject + verb + preposition + data;
		}

		/// <summary>
		/// Stores a birth/death statement: "in" for single-word or BC/AD values, "on" for
		/// multi-word values that contain a month, and nothing for other multi-word values.
		/// </summary>
		private void AppendLifeEventStatement(WikipediaData wiki_data, string subject, string verb, string data)
		{
			bool multiWordWithoutEra =
				(data.Split(' ').Length > 1) &&
				(!data.Contains(" BC")) &&
				(!data.Contains(" B.C.")) &&
				(!data.Contains(" AD")) &&
				(!data.Contains(" A.D."));

			if (!multiWordWithoutEra)
			{
				AppendHeaderStatement(wiki_data, subject + verb + "in " + data);
			}
			else if (ContainsMonth(data))
			{
				AppendHeaderStatement(wiki_data, subject + verb + "on " + data);
			}
		}
예제 #3
0
        /// <summary>
        /// Builds candidate questions from one entry of <c>wiki_data.content</c> (plus the
        /// article's categories and title) and passes the ones that survive the final
        /// filter to <c>SaveQuestion</c> together with <paramref name="mindpixel_filename"/>.
        /// </summary>
        /// <param name="mindpixel_filename">Forwarded unchanged to <c>SaveQuestion</c>.</param>
        /// <param name="wiki_data">Supplies <c>Title</c>, <c>content</c> and <c>categories</c>.</param>
        /// <param name="entry_number">Index into <c>wiki_data.content</c> of the entry to process.</param>
        /// <param name="man_woman">
        /// When false, a "Is ... a man?" / "Is ... a woman?" question may be queued; it is
        /// set to true once such a question has been queued.
        /// </param>
        private void AddMindpixel(
		    string mindpixel_filename,
		    WikipediaData wiki_data,
		    int entry_number,
		    ref bool man_woman)
        {
            string question;
            string title, category, bisect, question_header;
            string[] words;
            string[] titlewords;
            bool possible_person;
            int i;
			// content[0] is the entry itself; content[1] and content[2] are optional
			// rewrites derived from a clause enclosed by the first two ", " separators.
			string[] content = new string[3];
			// Questions queued for saving; duplicates are kept out via ContainString.
			ArrayList to_be_added = new ArrayList();
			
			content[0] = wiki_data.content[entry_number];
			content[1] = "";
			content[2] = "";
			
			// Locate the first two ", " separators (pos2 stays -1 when there is no first one).
			int pos2=-1;
			int pos = wiki_data.content[entry_number].IndexOf(", ");
			if (pos > -1)
			{
			    pos2 = wiki_data.content[entry_number].IndexOf(", ", pos+1);
			}
			
			// Only rewrite when both separators exist and the first one is within the first 50 characters.
			if ((pos < 50) && (pos > -1) && (pos2 > pos))
			{
				// str spans from just after the first comma up to and including the second comma.
				string str = wiki_data.content[entry_number].Substring(pos+1, pos2 - pos).Trim();				
				string[] str2 = str.Split(' ');
				if (str2.Length > 0)
				{
					string content1 = "";
					// Clause starting with a word beginning with "a" or "otherwise", with "the",
					// or with a word ending in "ly": join it to the leading text with " is ".
					if ((str2[0].StartsWith("a")) ||
					    (str2[0].StartsWith("otherwise")) ||
					    (str2[0] == "the") ||
					    (str2[0].EndsWith("ly")))
					{
						content1 = wiki_data.content[entry_number].Substring(0, pos) + " is " + str;
					}
					else
					{
						// Clause starting with "or..." or consisting of a single word: treat it as an alternative name.
						if ((str2[0].StartsWith("or")) ||
						    (str2.Length == 1))
						{
							content1 = wiki_data.content[entry_number].Substring(0, pos) + " is otherwise known as " + str;
						}
						else
						{
							// Otherwise keep everything before the second comma.
						    content1 = wiki_data.content[entry_number].Substring(0, pos2);
						}
					}
					// NOTE(review): ReplaceWord is defined elsewhere; presumably this strips the commas — confirm.
					content[1] = ReplaceWord(content1,",","");
					// Second variant: the entry with the text between the two commas cut out.
					string content2 = wiki_data.content[entry_number].Substring(0, pos) + wiki_data.content[entry_number].Substring(pos2);
					content[2] = ReplaceWord(content2,",","");
					//Console.WriteLine("Content1: " + content[1]);
					//Console.WriteLine("Content2: " + content[2]);
				}
			}
			
			// Generate questions for the original entry and for each non-empty variant.
			for (int s = 0; s < 3; s++)
			{
				if (content[s] != "")
				{
					//if (content[1] != "") Console.WriteLine("question1: " + content[s]);					
					
					// NOTE(review): ParseAsQuestion is defined elsewhere; presumably it rewrites the
					// statement as a question about the title — confirm.
		            question = ParseAsQuestion(wiki_data.Title, content[s]);
		            // current_question is a member declared outside this method.
		            current_question = question;
					
					//if (content[1] != "") Console.WriteLine("question2: " + question);
		
		            title = wiki_data.Title.ToLower();
		            title = removeBrackets(title);
		
		            // Titles containing "list of " produce no questions at all.
		            if (!title.Contains("list of "))
		            {
		                // Skip empty questions and ones mentioning these maintenance-related words.
		                if ((question != "") && (!question.Contains("disambiguation")) && (!question.Contains("cleanup"))
		                     && (!question.Contains("wikipedia") && (!question.Contains("wikified")))
		                     && (!question.Contains("article lacking") && (!question.Contains("article needing")))
		                    )
						{
							if (!ContainString(to_be_added, question)) to_be_added.Add(question);
						}
		
		                possible_person = false;
		                titlewords = title.Split(' ');                                
		                // One candidate question per usable category.
		                for (i = 0; i < wiki_data.categories.Count; i++)
		                {
		                    category = (String)wiki_data.categories[i];
							// NOTE(review): this debug output is live, unlike the commented-out ones nearby — confirm it is intended.
							Console.WriteLine("CATEGORY: " + category);
		                    // NOTE(review): these filters run before the category is lower-cased below,
		                    // so they are case-sensitive — confirm that is intended.
		                    if ((category != "") && (category != "disambiguation") &&
		                        (!category.Contains("cleanup")) && (!category.Contains("wikipedia") && (!category.Contains("wikified"))) &&
		                        (!category.Contains("article lacking") && (!category.Contains("article needing")) && (!category.Contains("article pending")))
		                        )
		                    {
		                        category = category.ToLower();
		                        words = category.Split(' ');                                
		
		                        // Default question shape: "Is <title> a type of <category>?"
		                        bisect = "a type of";
		                        question_header = "Is ";
		
		                        // For birth/death/suicide categories only the first word of the category
		                        // is kept (presumably a year such as "1950" — TODO confirm).
		                        if (category.Contains(" birth"))
		                        {
		                            question_header = "Was ";
		                            bisect = "born in";
		                            category = words[0];
		                            possible_person = true;
		                        }
		
		                        if (category.Contains(" death"))
		                        {
		                            question_header = "Did ";
		                            bisect = "die in";
		                            category = words[0];
		                            possible_person = true;
		                        }
		
		                        if (category.Contains("suicide"))
		                        {
		                            question_header = "Did ";
		                            bisect = "commit";
		                            category = words[0];
		                            possible_person = true;
		                        }
		
		                        if (category.Contains("people")) possible_person = true;
		                        // Blank out categories that should not yield a question.
		                        if (category.Contains("living people")) category = "";
		                        if (category.Contains("list of ")) category = "";
		
		                        if ((category != "") && (category != "stub") && (!category.Contains("disambiguation")))
		                        {
		                            question = question_header + title + " " + bisect + " " + category + "?";
									if (!ContainString(to_be_added, question)) to_be_added.Add(question);
		                            if (current_question == "") current_question = question;
		                        }
		                    }
		                }
		
		                // Ask about gender once per article: for two-word titles or when a category
		                // suggested a person, decided by the first word of the title.
		                if ((!man_woman) &&
						    (((titlewords.Length == 2) ||
						    ((possible_person) && (titlewords.Length > 0)))))
		                {
		                    // names is a member declared outside this method.
		                    if (names.IsMaleName(titlewords[0]))
		                    {
		                        question = "Is " + title + " a man?";
								if (!ContainString(to_be_added, question)) to_be_added.Add(question);
								man_woman = true;
		                    }
		                    else
		                    {
		                        if (names.IsFemaleName(titlewords[0]))
		                        {
		                            question = "Is " + title + " a woman?";
									if (!ContainString(to_be_added, question)) to_be_added.Add(question);
									man_woman = true;
		                        }
		                    }
		                }
		                
		            }
  			        
				}
			}
			
			// Final filter: drop questions whose last space-separated token has three
			// characters or fewer, or that end in one of the dangling words listed below.
			for (i = 0; i < to_be_added.Count; i++)
			{
				question = (string)to_be_added[i];
				string[] s2 = question.Split(' ');
				if (s2[s2.Length-1].Length > 3)
				{
					if ((!question.EndsWith(" born?")) &&
					    (!question.EndsWith(" and?")) &&
					    (!question.EndsWith(" usually?")) &&
					    (!question.EndsWith(" was?")) &&
					    (!question.EndsWith(" small?")) &&
					    (!question.EndsWith(" large?")) &&
					    (!question.EndsWith(" under?"))
					    )
					    SaveQuestion(mindpixel_filename, question);
				}
			}	
            
        }
예제 #4
0
        /// <summary>
        /// Extracts the title, the first sentence(s) of the first paragraph and the
        /// category names from the HTML of a Wikipedia article, then generates questions
        /// for them via <c>AddMindpixel</c>. Afterwards the infobox headers of the
        /// unmodified HTML are processed via <c>ExtractHeaderData</c> and questions are
        /// generated for the resulting statements as well.
        /// </summary>
        /// <param name="html">Full HTML of the article page.</param>
        /// <param name="mindpixel_filename">Forwarded unchanged to <c>AddMindpixel</c>.</param>
        private void ProcessWikipediaEntry(
            string html,
            string mindpixel_filename)
        {
            int pos, pos2, i;
            string firstParagraph;
            string[] sentence;
            string[] words;
            string category;
            bool man_woman = false;
            // The parsing below consumes "html" piece by piece; keep the full page for the header pass.
            string html_original = html;

            WikipediaData wiki_data = new WikipediaData();
            wiki_data.Title = "";
            wiki_data.content_entries = 0;
            wiki_data.categories.Clear();

            pos = html.IndexOf("firstHeading");
            if (pos > 0)
            {
                // Skip the marker plus the two characters that follow it.
                html = html.Substring(pos + 14);
                pos2 = html.IndexOf("<");
                if (pos2 > 0)
                {
                    // get the title of the article
                    wiki_data.Title = html.Substring(0, pos2);
                    wiki_data.Title = ReplaceWord(wiki_data.Title, "&ndash;", "-");
                    wiki_data.Title = ReplaceWord(wiki_data.Title, "&#39;", "'");

                    html = html.Substring(pos2);
                    pos = html.IndexOf("start content");
                    if (pos > 0)
                    {
                        // get the content
                        html = html.Substring(pos);

                        int tableStart = html.IndexOf("<table");
                        pos = html.IndexOf("<p>");

                        if ((tableStart != -1) && (tableStart < pos))
                        {
                            // A table precedes the first paragraph: skip past it. When the
                            // closing tag is missing, keep the paragraph already found instead
                            // of calling Substring(-1), which would throw.
                            int tableEnd = html.IndexOf("</table>");
                            if (tableEnd >= 0)
                            {
                                html = html.Substring(tableEnd);
                                pos = html.IndexOf("<p>");
                            }
                        }

                        if (pos > 0)
                        {
                            html = html.Substring(pos + 3);

                            // The original offset arithmetic (pos -= pos + 3, then pos + 20 and
                            // pos + 10) always resolved to these two start offsets; they are
                            // kept so the same sentence boundary is chosen.
                            const int farOffset = 17;
                            const int nearOffset = 7;

                            // Prefer ". " over a bare ".", and the farther start offset over the nearer one.
                            pos2 = -1;
                            if (farOffset < html.Length) pos2 = html.IndexOf(". ", farOffset);
                            if ((pos2 == -1) && (nearOffset < html.Length)) pos2 = html.IndexOf(". ", nearOffset);
                            if ((pos2 == -1) && (farOffset < html.Length)) pos2 = html.IndexOf(".", farOffset);
                            if ((pos2 == -1) && (nearOffset < html.Length)) pos2 = html.IndexOf(".", nearOffset);
                            if (pos2 == -1) pos2 = html.IndexOf(".");

                            if (pos2 - 3 > 0)
                            {
                                firstParagraph = html.Substring(0, pos2);
                                firstParagraph = RemoveFormatting(firstParagraph);

                                sentence = firstParagraph.Split('.');

                                // Never store more than 1000 sentences.
                                int max = sentence.Length;
                                if (max > 1000) max = 1000;
                                for (int s = 0; s < max; s++)
                                {
                                    wiki_data.content[wiki_data.content_entries++] = sentence[s];
                                }
                            }
                        }
                    }

                    // look for categories: each one follows a "Category: marker and ends
                    // one character before the next '>'
                    pos = 1;
                    while (pos > 0)
                    {
                        pos = html.IndexOf("\"Category:");
                        if (pos > 0)
                        {
                            pos += 10;
                            html = html.Substring(pos);
                            pos2 = html.IndexOf(">");
                            if (pos2 > 0)
                            {
                                category = html.Substring(0, pos2 - 1);
                                if (!category.Contains(" stub"))
                                {
                                    // Crude singularisation of every word: "...ies" -> "...y"
                                    // (words longer than four characters), then drop a trailing "s".
                                    words = category.Split(' ');
                                    for (i = 0; i < words.Length; i++)
                                    {
                                        if (words[i].Length > 4)
                                        {
                                            if (words[i].Substring(words[i].Length - 3, 3) == "ies")
                                                words[i] = words[i].Substring(0, words[i].Length - 3) + "y";
                                        }
                                        // Consecutive spaces yield empty words; guard against them
                                        // because Substring(-1, 1) would throw.
                                        if ((words[i].Length > 0) && (words[i][words[i].Length - 1] == 's'))
                                            words[i] = words[i].Substring(0, words[i].Length - 1);
                                    }

                                    // Re-join with single spaces. The original condition
                                    // (i < words.Length) was always true and left a trailing space
                                    // on every category.
                                    category = string.Join(" ", words);

                                    wiki_data.categories.Add(category);
                                    html = html.Substring(pos2);
                                }
                            }
                        }
                    }
                }
            }

            // Questions from the first-paragraph sentences. The categories are cleared
            // after the first entry, so only that entry sees them.
            for (i = 0; i < wiki_data.content_entries; i++)
            {
                AddMindpixel(mindpixel_filename, wiki_data, i, ref man_woman);
                wiki_data.categories.Clear();
            }

            // Reuse the content slots for the statements derived from the table headers.
            wiki_data.content_entries = 0;
            ExtractHeaderData(html_original, wiki_data, man_woman);

            for (i = 0; i < wiki_data.content_entries; i++)
            {
                AddMindpixel(mindpixel_filename, wiki_data, i, ref man_woman);
                wiki_data.categories.Clear();
            }
        }
예제 #5
0
		/// <summary>
		/// Walks every "&lt;th ...&gt;...&lt;/th&gt;" element in <paramref name="html"/>, cleans the
		/// header text, pairs it with the text of a following "&lt;td ...&gt;...&lt;/td&gt;" cell
		/// and hands usable pairs to <c>AddHeaderQuestion</c> with <c>wiki_data.Title</c>
		/// as the subject.
		/// </summary>
		/// <param name="html">HTML to scan.</param>
		/// <param name="wiki_data">Supplies the title and is passed on to <c>AddHeaderQuestion</c>.</param>
		/// <param name="probably_person">Passed through unchanged to <c>AddHeaderQuestion</c>.</param>
		private void ExtractHeaderData(
		    string html, 
		    WikipediaData wiki_data, 
		    bool probably_person)
		{
			int headerStart = 0;
			while (true)
			{
				// Each search resumes one character after the previous header tag.
				headerStart = html.IndexOf("<th", headerStart + 1);
				if (headerStart == -1) break;

				int headerTagEnd = html.IndexOf(">", headerStart);
				if (headerTagEnd == -1) continue;

				int headerClose = html.IndexOf("</th>", headerTagEnd + 1);
				if (headerClose == -1) continue;

				// Text between the end of the opening tag and the closing tag.
				string headerText = html.Substring(headerTagEnd + 1, headerClose - headerTagEnd - 1);
				headerText = RemoveFormatting(headerText);
				if (headerText == "E&deg") headerText = "East";
				if (headerText == "N&deg") headerText = "Nort";
				headerText = ReplaceWord(headerText, "&amp;", "");
				headerText = ReplaceWord(headerText, "&nbsp;", "");
				headerText = ReplaceWord(headerText, "&ndash;", "-");
				headerText = ReplaceWord(headerText, "&#39;","'");
				headerText = ReplaceWord(headerText, "&deg;F","degrees farenheit");
				headerText = headerText.Trim();
				if (headerText.StartsWith("-")) headerText = headerText.Substring(1).Trim();

				// Skip empty headers, headers with a leftover '&', and the
				// "scientific class..." heading.
				if (headerText.Contains("&") || (headerText == "")) continue;
				if (headerText.ToLower().Contains("scientific class")) continue;

				// Try up to four successive cells after the header until one yields text.
				string cellText = "";
				int searchFrom = headerClose;
				for (int tries = 3; (cellText == "") && (tries >= 0); tries--)
				{
					int cellStart = html.IndexOf("<td", searchFrom);
					// No further cell: the remaining attempts could not find one either.
					if (cellStart == -1) break;

					int cellTagEnd = html.IndexOf(">", cellStart);
					int cellClose = (cellTagEnd > -1) ? html.IndexOf("</td>", cellTagEnd) : -1;
					if (cellClose > -1)
					{
						cellText = RemoveFormatting(html.Substring(cellTagEnd + 1, cellClose - cellTagEnd - 1));
						cellText = ReplaceWord(cellText, "&amp;", "");
						cellText = ReplaceWord(cellText, "&quot;", "");
						cellText = ReplaceWord(cellText, "&nbsp;", "");
						cellText = ReplaceWord(cellText, "&ndash;", "-");
						cellText = ReplaceWord(cellText, "&#39;","'");
						cellText = ReplaceWord(cellText, "&sup2;","^2");
						cellText = ReplaceWord(cellText, "&deg;F","degrees farenheit");
						cellText = cellText.Trim();
						// A lone "#" counts as empty so that the next cell is tried.
						if (cellText == "#") cellText = "";

						if ((cellText != "") && (!cellText.Contains("&")))
						{
							AddHeaderQuestion(wiki_data, wiki_data.Title, headerText, cellText, probably_person);
						}
					}

					// The next attempt starts just past this cell's opening tag.
					searchFrom = cellStart + 1;
				}
			}
		}