setDates() публичный Метод

public setDates ( String _input, String dateExtractPattern, String eventDateLine ) : void
_input String
dateExtractPattern String
eventDateLine String
Результат void
Пример #1
0
		/// <summary>
		/// Loads the XML document at <paramref name="path"/>, scans the revision text of every
		/// "page" element under the "mediawiki" root for infoboxes, and creates one
		/// CalendarEvent per date found on an infobox "...date = ..." line.
		/// </summary>
		/// <remarks>
		/// Side effects: sets pagesCount, pageCounter and eventsCount, appends to allEvents,
		/// adds to allDays, and feeds every accepted event to index.Index.
		/// Dates for which setDates throws DataMisalignedException are logged to the console
		/// and skipped. Pages without a revision/text element contribute no events.
		/// </remarks>
		/// <param name="path">Location of the XML file, passed unchanged to XDocument.Load.</param>
		public void initParsing(String path)
		{
			// Matches an "{{Infobox ...}}" template spanning one or more lines.
			const String infoboxPattern = "{{Infobox (.+\n)+}}";

			// Matches infobox lines such as "birth_date = {{birth date|1809|2|12}}". TODO optimize!
			const String datePattern = "[A-Za-z_]+date\\s+=.+";

			// Alternatives, in order. The pattern text is also handed to setDates, so it must
			// stay exactly as is.
			// NOTE(review): every month name is individually optional, so the textual
			// alternatives can match without a month - confirm this is intended.
			const String dateExtractPattern =
				@"([0-9]{1,4}\|[0-1]{0,1}[0-9]{1}\|[0-3]{0,1}[0-9]{1})" //Y|MM|DD
				+ @"|((?:(BC)?|(BCE)?)\|[0-9]+(\|\d\|\d){0})" //'BCE'|Y
				+ @"|((?:([0-3]?[0-9]) ((?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) )((\d+(\sBCE)?|(\sBC)?)|((AD\s)?\d{1,})))" //3 July 2001
				+ @"|((?:(?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) (?:([0-3]?[0-9])), [0-9]{0,}\s?(?:(BC)?|(BCE)?))" // July 3, 2001
				+ @"|((?:([0-3]?[0-9]) ((?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) )(?:(AD)?)\s[0-9]{1,})"; //3 July AD 2001

			// The expressions do not depend on the page, so build them once up front.
			Regex infoboxRegex = new Regex(infoboxPattern, RegexOptions.Multiline);
			Regex dateLineRegex = new Regex(datePattern);
			Regex dateExtractRegex = new Regex(dateExtractPattern);

			XDocument input = XDocument.Load(path);

			// NOTE(review): element names are looked up without an XML namespace; a dump that
			// declares a default xmlns on <mediawiki> would not be found here - confirm the
			// expected input format.
			// Materialize once so counting and iterating do not re-enumerate the XML tree.
			List<XElement> pages = input.Element("mediawiki").Elements("page").ToList();

			Console.WriteLine(pages.Count.ToString());
			pagesCount = pages.Count;

			pageCounter = 0;
			foreach (XElement page in pages)
			{
				// A page lacking <revision>/<text> is treated as having empty text, which
				// simply produces no infobox matches.
				XElement revision = page.Element("revision");
				XElement textElement = revision != null ? revision.Element("text") : null;
				String wikiText = textElement != null ? textElement.Value : String.Empty;

				// Almost always a single infobox per page.
				foreach (Match infobox in infoboxRegex.Matches(wikiText))
				{
					/*
					 * Sample date lines:
					 * |birth_date={{birth date|1809|2|12}}
					 * | birth_date = {{birth date|mf=yes|1905|2|2}}
					 * birth_date  = {{birth date and age|1947|04|01|df=y}}
					 * | birth_date = {{Birth date|df=yes|1885|4|3}}
					 */
					//TODO check release_date / pub_date - just year
					foreach (Match dateLine in dateLineRegex.Matches(infobox.Value))
					{
						foreach (Match dateMatch in dateExtractRegex.Matches(dateLine.Value))
						{
							CalendarEvent extractedEvent = new CalendarEvent(page);
							try
							{
								extractedEvent.setDates(dateMatch.Value, dateExtractPattern, dateLine.Value);
							}
							catch (DataMisalignedException exc)
							{
								// Log the offending infobox and move on to the next date.
								Console.WriteLine(infobox);
								Console.WriteLine(exc.Message);
								continue;
							}

							allEvents.AddLast(extractedEvent);

							Dictionary<long, string> indexing = new Dictionary<long, string>();
							indexing.Add(extractedEvent.dateId, extractedEvent.title + ";" + extractedEvent.eventType);
							index.Index(indexing);

							// Group events by day: extend the existing day or start a new one.
							if (allDays.ContainsKey(extractedEvent.dateId))
							{
								allDays[extractedEvent.dateId].Add(extractedEvent);
							}
							else
							{
								allDays.Add(extractedEvent.dateId, new DayEventCollection(extractedEvent.date.Day, extractedEvent.date.Month, extractedEvent.date.Year, extractedEvent));
							}

							// Updated per event so the running total is visible during parsing.
							eventsCount = allEvents.Count;
						}
					}
				}
				pageCounter++;
			}
			Console.WriteLine(allEvents.Count);
		}
Пример #2
0
        /// <summary>
        /// Loads the XML document at <paramref name="path"/>, scans the revision text of every
        /// "page" element under the "mediawiki" root for infoboxes, and creates one
        /// CalendarEvent per date found on an infobox "...date = ..." line.
        /// </summary>
        /// <remarks>
        /// Side effects: sets pagesCount, pageCounter and eventsCount, appends to allEvents,
        /// adds to allDays, and feeds every accepted event to index.Index.
        /// Dates for which setDates throws DataMisalignedException are logged to the console
        /// and skipped. Pages without a revision/text element contribute no events.
        /// </remarks>
        /// <param name="path">Location of the XML file, passed unchanged to XDocument.Load.</param>
        public void initParsing(String path)
        {
            // Matches an "{{Infobox ...}}" template spanning one or more lines.
            const String infoboxPattern = "{{Infobox (.+\n)+}}";

            // Matches infobox lines such as "birth_date = {{birth date|1809|2|12}}". TODO optimize!
            const String datePattern = "[A-Za-z_]+date\\s+=.+";

            // Alternatives, in order. The pattern text is also handed to setDates, so it must
            // stay exactly as is.
            // NOTE(review): every month name is individually optional, so the textual
            // alternatives can match without a month - confirm this is intended.
            const String dateExtractPattern =
                @"([0-9]{1,4}\|[0-1]{0,1}[0-9]{1}\|[0-3]{0,1}[0-9]{1})" //Y|MM|DD
                + @"|((?:(BC)?|(BCE)?)\|[0-9]+(\|\d\|\d){0})" //'BCE'|Y
                + @"|((?:([0-3]?[0-9]) ((?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) )((\d+(\sBCE)?|(\sBC)?)|((AD\s)?\d{1,})))" //3 July 2001
                + @"|((?:(?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) (?:([0-3]?[0-9])), [0-9]{0,}\s?(?:(BC)?|(BCE)?))" // July 3, 2001
                + @"|((?:([0-3]?[0-9]) ((?:January)?|(?:February)?|(?:March)?|(?:April)?|(?:May)?|(?:June)?|(?:July)?|(?:August)?|(?:September)?|(?:October)?|(?:November)?|(?:December)?) )(?:(AD)?)\s[0-9]{1,})"; //3 July AD 2001

            // The expressions do not depend on the page, so build them once up front.
            Regex infoboxRegex = new Regex(infoboxPattern, RegexOptions.Multiline);
            Regex dateLineRegex = new Regex(datePattern);
            Regex dateExtractRegex = new Regex(dateExtractPattern);

            XDocument input = XDocument.Load(path);

            // NOTE(review): element names are looked up without an XML namespace; a dump that
            // declares a default xmlns on <mediawiki> would not be found here - confirm the
            // expected input format.
            // Materialize once so counting and iterating do not re-enumerate the XML tree.
            List<XElement> pages = input.Element("mediawiki").Elements("page").ToList();

            Console.WriteLine(pages.Count.ToString());
            pagesCount = pages.Count;

            pageCounter = 0;
            foreach (XElement page in pages)
            {
                // A page lacking <revision>/<text> is treated as having empty text, which
                // simply produces no infobox matches.
                XElement revision = page.Element("revision");
                XElement textElement = revision != null ? revision.Element("text") : null;
                String wikiText = textElement != null ? textElement.Value : String.Empty;

                // Almost always a single infobox per page.
                foreach (Match infobox in infoboxRegex.Matches(wikiText))
                {
                    /*
                     * Sample date lines:
                     * |birth_date={{birth date|1809|2|12}}
                     * | birth_date = {{birth date|mf=yes|1905|2|2}}
                     * birth_date  = {{birth date and age|1947|04|01|df=y}}
                     * | birth_date = {{Birth date|df=yes|1885|4|3}}
                     */
                    //TODO check release_date / pub_date - just year
                    foreach (Match dateLine in dateLineRegex.Matches(infobox.Value))
                    {
                        foreach (Match dateMatch in dateExtractRegex.Matches(dateLine.Value))
                        {
                            CalendarEvent extractedEvent = new CalendarEvent(page);
                            try
                            {
                                extractedEvent.setDates(dateMatch.Value, dateExtractPattern, dateLine.Value);
                            }
                            catch (DataMisalignedException exc)
                            {
                                // Log the offending infobox and move on to the next date.
                                Console.WriteLine(infobox);
                                Console.WriteLine(exc.Message);
                                continue;
                            }

                            allEvents.AddLast(extractedEvent);

                            Dictionary<long, string> indexing = new Dictionary<long, string>();
                            indexing.Add(extractedEvent.dateId, extractedEvent.title + ";" + extractedEvent.eventType);
                            index.Index(indexing);

                            // Group events by day: extend the existing day or start a new one.
                            if (allDays.ContainsKey(extractedEvent.dateId))
                            {
                                allDays[extractedEvent.dateId].Add(extractedEvent);
                            }
                            else
                            {
                                allDays.Add(extractedEvent.dateId, new DayEventCollection(extractedEvent.date.Day, extractedEvent.date.Month, extractedEvent.date.Year, extractedEvent));
                            }

                            // Updated per event so the running total is visible during parsing.
                            eventsCount = allEvents.Count;
                        }
                    }
                }
                pageCounter++;
            }
            Console.WriteLine(allEvents.Count);
        }