/// <summary>
/// Requests the schedule page for the stop described by <paramref name="si"/> and parses it.
/// </summary>
/// <param name="client">HTTP client handed to GetHttpResponse.</param>
/// <param name="si">Transport type, route, days, direction and stop number used to build the request URI.</param>
/// <returns>The parsed schedule, or null when the response has zero length.</returns>
private static async Task<Schedule> GetSchedule(HttpClient client, ScheduleInfo si)
{
    VerbosePrint("Obtaining schedule for stop");

    // The route name may contain Cyrillic characters, so it is encoded before it goes into the URI.
    var routeForUri = EncodeCyrillicUri(si.RouteName);
    var transport = si.GetTransportTypeString();
    var days = si.DaysOfOperation.ToString();
    var directionCode = si.GetDirectionCodeString();
    var stopNumber = si.StopNumber.ToString();
    var requestUri = Uri.GetUri(transport, routeForUri, days, directionCode, stopNumber);

    var body = await GetHttpResponse(client, requestUri);

    // An empty response means there is nothing to parse for this stop.
    return body.Length == 0 ? null : ScheduleParser.Parse(body, si);
}
//I know that parsing HTML by regex is a bad idea, but objective is not to use third-party parsers
/// <summary>
/// Parses a schedule HTML page: the validity date, every hour/minute entry, and the legend
/// that describes special (colored) routes.
/// </summary>
/// <param name="htmlData">Raw HTML of the schedule page.</param>
/// <param name="si">Schedule description passed to the <see cref="Schedule"/> constructor.</param>
/// <returns>A schedule filled with the validity time, the entries and the special-route descriptions.</returns>
/// <exception cref="ArgumentNullException"><paramref name="htmlData"/> is null.</exception>
/// <exception cref="Exception">
/// The validity time, an hour cell or a minutes cell cannot be located or does not match its pattern.
/// </exception>
public static Schedule Parse(string htmlData, ScheduleInfo si)
{
    if (htmlData == null)
    {
        throw new ArgumentNullException("htmlData");
    }

    var schedule = new Schedule(si);

    //<span class=\"hour\">(\d+)</span></td><td align=.*>(.*)</td>
    var hourRegex = new Regex(HourRegexPattern);
    //<span class="minutes" >02</span>
    //<span class="minutes" style="color: red; font-weight: bold;">37</span><br>
    var minuteRegex = new Regex(MinutesRegexPattern);

    //Validity time: the text between ValidityTimeSearchStr and the next TagBeginning
    var validityStart = htmlData.IndexOf(ValidityTimeSearchStr);
    if (validityStart == -1)
    {
        throw new Exception("Validity time not found!");
    }
    validityStart += ValidityTimeSearchStr.Length;

    var validityEnd = htmlData.IndexOf(TagBeginning, validityStart);
    if (validityEnd == -1)
    {
        throw new Exception("Validity time border not found!");
    }
    schedule.ValidityTime = ParseDateTime(htmlData.Substring(validityStart, validityEnd - validityStart));

    //Current position of htmlData processing (only moves forward as items are found and parsed)
    var index = validityEnd;

    //Iterative hours and minutes parsing; the loop ends when no further hour marker exists
    while (true)
    {
        var hourStart = htmlData.IndexOf(HourSearchStr, index);
        if (hourStart == -1)
        {
            break;
        }

        var hourEnd = htmlData.IndexOf(TdClosingTag, hourStart);
        if (hourEnd == -1)
        {
            //Truncated page: report it instead of failing inside Substring with a negative length
            throw new Exception("Hour border not found!");
        }

        //NOTE(review): an older comment here said grey hours are ignored because they do not match
        //the regexp, but a non-matching hour cell raises an exception - confirm which is intended.
        var hourMatch = hourRegex.Match(htmlData.Substring(hourStart, hourEnd - hourStart));
        if (hourMatch.Length == 0)
        {
            throw new Exception("Failed to find hour info!");
        }
        var hour = Convert.ToSByte(hourMatch.Groups[1].Value);

        //The minutes of this hour live between MinutesSearchStr and the next TdClosingTag
        var minutesStart = htmlData.IndexOf(MinutesSearchStr, hourStart);
        if (minutesStart == -1)
        {
            throw new Exception("Minutes info for hour " + hour + " not found!");
        }

        var minutesEnd = htmlData.IndexOf(TdClosingTag, minutesStart);
        if (minutesEnd == -1)
        {
            throw new Exception("Minutes border for hour " + hour + " not found!");
        }

        AddEntriesForHour(schedule, minuteRegex, htmlData.Substring(minutesStart, minutesEnd - minutesStart), hour);

        //all minutes for current hour found, skipping to next hour
        index = minutesEnd;
    }

    ApplyLegend(schedule, htmlData, index);
    return schedule;
}

/// <summary>
/// Splits the minutes cell of one hour into pieces ending with SpanClosingTag and adds one
/// schedule entry per piece.
/// </summary>
/// <exception cref="Exception">A piece does not match the minutes pattern.</exception>
private static void AddEntriesForHour(Schedule schedule, Regex minuteRegex, string allMinutesStr, sbyte hour)
{
    var pieceStart = 0;
    while (true)
    {
        var closingTagIndex = allMinutesStr.IndexOf(SpanClosingTag, pieceStart);
        if (closingTagIndex == -1)
        {
            break;
        }

        //Each piece includes its closing tag
        var pieceEnd = closingTagIndex + SpanClosingTag.Length;
        var minuteMatch = minuteRegex.Match(allMinutesStr.Substring(pieceStart, pieceEnd - pieceStart));
        pieceStart = pieceEnd;

        if (minuteMatch.Length == 0)
        {
            throw new Exception("Failed to find minute info!");
        }

        //Group 3 holds the minute; group 2 holds the special-route marker, if any
        var minute = Convert.ToSByte(minuteMatch.Groups[3].Value);
        var specialRoute = minuteMatch.Groups[2].Value;
        if (specialRoute.Length > 1)
        {
            var type = RouteTypeProvider.GetRouteType(specialRoute);
            schedule.AddEntry(new ScheduleEntry(hour, minute, type)); //special route
        }
        else
        {
            schedule.AddEntry(new ScheduleEntry(hour, minute)); //regular route
        }
    }
}

/// <summary>
/// Reads the legend that follows LegendHeaderStr (searched from <paramref name="startIndex"/>)
/// and registers a destination for every colored route type it lists. Does nothing when the
/// legend header or its closing border is absent.
/// </summary>
private static void ApplyLegend(Schedule schedule, string htmlData, int startIndex)
{
    //IndexOf reports "not found" as -1; 0 is a valid position
    var headerIndex = htmlData.IndexOf(LegendHeaderStr, startIndex);
    if (headerIndex == -1)
    {
        return;
    }

    var legendStart = headerIndex + LegendHeaderStr.Length;
    var legendEnd = htmlData.IndexOf(TdClosingTag, legendStart);
    if (legendEnd == -1)
    {
        return;
    }

    var legendData = htmlData.Substring(legendStart, legendEnd - legendStart);

    //regex pattern with colors: <p class="helpfile"><b style="color: (\w+)">(.*)<\/b>(.*)<\/p>
    //group1: color name, group2: color name in russian (bold text), group3: non-bold text (description)
    //TODO: change regexp, no need in russian name of color
    var colorsRegex = new Regex(ColorsRegexPattern);
    foreach (Match match in colorsRegex.Matches(legendData))
    {
        var type = RouteTypeProvider.GetRouteType(match.Groups[1].Value);
        var destinationRaw = match.Groups[3].Value;

        //The destination starts at the From marker; when the marker is missing the whole
        //description is kept rather than failing on Substring(-1).
        var fromIndex = destinationRaw.IndexOf(From);
        var destination = fromIndex == -1 ? destinationRaw : destinationRaw.Substring(fromIndex);
        schedule.SetSpecialRoute(type, destination);
    }
}
/// <summary>
/// Creates a schedule bound to the given description, with an empty list of entries.
/// </summary>
/// <param name="info">Value stored in <c>Info</c>.</param>
public Schedule(ScheduleInfo info)
{
    this.Info = info;
    this.Entries = new List<ScheduleEntry>();
}
/// <summary>
/// Walks transport types, routes, days of operation, directions and stops, and enqueues one
/// <c>ScheduleInfo</c> per stop into <c>_siQueue</c> (guarded by <c>_siLock</c>).
/// Pauses for <c>_sleepTime</c> after each enqueued stop.
/// </summary>
/// <param name="client">HTTP client passed to every list request.</param>
private static async Task GetLists(HttpClient client)
{
    foreach (var type in TrType.TransportTypes)
    {
        VerbosePrint("Obtaining routes for " + type);
        var routes = await GetRoutesList(client, type);

        foreach (var route in routes)
        {
            VerbosePrint("\tFound route: " + route);
            var days = await GetDaysOfOperation(client, type, route);
            if (days.Count == 0)
            {
                continue; //skip faulty routes without anything ("route", "streets", "stations")
            }

            foreach (var day in days)
            {
                VerbosePrint("\t\tWorks on " + day);

                //Direction names is not necessary, using AB/BA instead for iterating
                var directions = await GetDirections(client, type, route, day);
                if (directions.Count == 0)
                {
                    continue; //skip faulty routes without directions (just in case if they appear in Mosgortans schedules)
                }

                //Stop at whichever list is shorter: a route that reports fewer direction names
                //than there are direction codes must not abort the whole crawl with an
                //out-of-range index.
                for (var j = 0; j < Direction.Directions.Length && j < directions.Count; j++)
                {
                    var dirCode = Direction.Directions[j];
                    var direction = directions[j];
                    VerbosePrint("\t\t\tFound direction: " + direction);

                    var stops = await GetStops(client, type, route, day, dirCode);
                    if (stops.Count == 0)
                    {
                        continue; //skip faulty routes without stops (this can occur when new routes are added to database, but without schedules)
                    }

                    for (var stopNum = 0; stopNum < stops.Count; stopNum++)
                    {
                        VerbosePrint("\t\t\t\tFound stop: " + stops[stopNum]);
                        try
                        {
                            var scheduleInfo = new ScheduleInfo(type, route, day, dirCode, direction, stopNum, stops[stopNum]);
                            lock (_siLock)
                            {
                                _siQueue.Enqueue(scheduleInfo);
                            }

                            //Pause without blocking the thread that runs this async method
                            await Task.Delay(_sleepTime);
                        }
                        catch (Exception ex) //If we got exception - log it, and skip faulty item
                        {
                            Console.WriteLine("EXCEPTION OCCURED: " + ex.Message);
                        }
                    }
                }
            }
        }
    }
}
/// <summary>
/// Worker loop: takes <c>ScheduleInfo</c> items from <c>_siQueue</c>, downloads and parses the
/// schedule for each, and enqueues one CSV line per schedule entry into <c>_outputQueue</c>.
/// The loop ends once <c>_parseFinish</c> is set and the queue is empty; <c>_abortedCnt</c> is
/// then incremented to signal that this worker is done.
/// </summary>
/// <param name="clientParam">Must be an <c>HttpClient</c>.</param>
/// <remarks>
/// NOTE(review): presumably used as a thread entry point (hence the <c>object</c> parameter).
/// Because the method is <c>async void</c>, the calling thread returns at the first await, so
/// callers should rely on <c>_abortedCnt</c> rather than on the thread's lifetime - confirm.
/// </remarks>
private static async void ParseThread(object clientParam)
{
    var client = clientParam as HttpClient;
    if (client == null)
    {
        //Fail fast with a clear message instead of a NullReferenceException on the first request
        throw new ArgumentException("ParseThread expects an HttpClient instance.", "clientParam");
    }

    const string formatStr = "{0};{1};{2};{3};'{4}';{5};'{6}';{7};{8};{9};{10};'{11}'";

    while (true)
    {
        await Task.Delay(_sleepTime);

        //Placeholder only; it is overwritten under the lock whenever the queue is not empty
        var scheduleInfo = default(ScheduleInfo);
        int cnt;
        lock (_siLock)
        {
            cnt = _siQueue.Count;
            if (cnt != 0)
            {
                scheduleInfo = _siQueue.Dequeue();
            }
        }

        if (cnt == 0)
        {
            if (_parseFinish) //Thread finish condition
            {
                break;
            }
            continue;
        }

        Schedule schedule;
        try
        {
            schedule = await GetSchedule(client, scheduleInfo);
        }
        catch (Exception ex) //One faulty page must not bring the whole process down: log it and skip the item
        {
            Console.WriteLine("EXCEPTION OCCURED: " + ex.Message);
            continue;
        }

        if (schedule == null)
        {
            continue;
        }

        //These values are the same for every entry of one schedule, so they are read once
        var si = schedule.GetInfo();
        var tType = si.GetTransportTypeString();
        var rName = si.RouteName;
        var ds = si.DaysOfOperation.ToString();
        var dc = si.GetDirectionCodeString();
        var dn = si.DirectionName;
        var snum = si.StopNumber;
        var sname = si.StopName;
        var valDat = schedule.ValidityTime.ToString("dd.MM.yyyy");

        foreach (var entry in schedule.Entries)
        {
            var rType = entry.RouteType;
            var rDest = schedule.GetSpecialRoute(rType);
            var csvStr = string.Format(formatStr, tType, rName, ds, dc, dn, snum, sname, valDat, entry.Hour, entry.Minute, rType, rDest);
            lock (_outLock)
            {
                _outputQueue.Enqueue(csvStr);
            }
        }
    }

    //Signal completion directly. The previous approach aborted the current thread and counted
    //inside the ThreadAbortException handler; Thread.Abort is obsolete and unsupported on
    //modern .NET, where the counter would never be incremented.
    lock (_abortedLock)
    {
        _abortedCnt++;
    }
}