コード例 #1
0
        /// <summary>
        /// Downloads the schedule page for the stop described by <paramref name="si"/> and parses it.
        /// </summary>
        /// <param name="client">HTTP client handed to <c>GetHttpResponse</c>.</param>
        /// <param name="si">Transport type, route, days of operation, direction and stop number used to build the request URI; also passed to the parser.</param>
        /// <returns>
        /// The schedule produced by <c>ScheduleParser.Parse</c>, or <c>null</c> when the
        /// response is null or empty.
        /// </returns>
        private static async Task <Schedule> GetSchedule(HttpClient client, ScheduleInfo si)
        {
            VerbosePrint("Obtaining schedule for stop");

            //encode cyrillic characters in route name, if they are present
            var encodedRoute = EncodeCyrillicUri(si.RouteName);

            var uri = Uri.GetUri(
                si.GetTransportTypeString(),
                encodedRoute,
                si.DaysOfOperation.ToString(),
                si.GetDirectionCodeString(),
                si.StopNumber.ToString());
            var response = await GetHttpResponse(client, uri);

            //A null response is treated the same as an empty one instead of
            //failing with NullReferenceException on .Length.
            if (response == null || response.Length == 0)
            {
                return(null);
            }

            return(ScheduleParser.Parse(response, si));
        }
コード例 #2
0
        /// <summary>
        /// Parses the HTML of a stop schedule page into a <see cref="Schedule"/>:
        /// the validity date, every hour/minute entry and, when present, the
        /// legend that describes special (colored) routes.
        /// </summary>
        /// <remarks>
        /// I know that parsing HTML by regex is a bad idea, but objective is not to use third-party parsers.
        /// </remarks>
        /// <param name="htmlData">Raw HTML of the schedule page.</param>
        /// <param name="si">Schedule info the resulting schedule is created with.</param>
        /// <returns>The populated schedule.</returns>
        /// <exception cref="Exception">
        /// Thrown when the validity date, an hour cell, a minutes cell or a minute
        /// value cannot be located in <paramref name="htmlData"/>.
        /// </exception>
        public static Schedule Parse(string htmlData, ScheduleInfo si)
        {
            var schedule = new Schedule(si);

            //<span class=\"hour\">(\d+)</span></td><td align=.*>(.*)</td>
            var hourRegex = new Regex(HourRegexPattern);

            //<span class="minutes" >02</span>
            //<span class="minutes" style="color: red; font-weight: bold;">37</span><br>
            var minuteRegex = new Regex(MinutesRegexPattern);

            //Validity date: text between the validity marker and the next tag
            var validityMarker = htmlData.IndexOf(ValidityTimeSearchStr, 0);
            if (validityMarker == -1)
            {
                throw new Exception("Validity time not found!");
            }

            var index       = validityMarker + ValidityTimeSearchStr.Length; //current position of htmlData processing
            var validityEnd = htmlData.IndexOf(TagBeginning, index);
            if (validityEnd == -1)
            {
                throw new Exception("Validity time border not found!");
            }

            schedule.ValidityTime = ParseDateTime(htmlData.Substring(index, validityEnd - index));
            index = validityEnd;

            //Iterative hours and minutes parsing; stops when no further hour marker exists
            while (true)
            {
                var hourStart = htmlData.IndexOf(HourSearchStr, index);
                if (hourStart == -1)
                {
                    break;
                }

                var hourEnd = htmlData.IndexOf(TdClosingTag, hourStart);
                if (hourEnd == -1)
                {
                    throw new Exception("Hour cell border not found!");
                }

                //NOTE(review): the original comment said grey hours are ignored, but a
                //non-matching hour cell raises an error here; presumably grey hours are
                //never found by HourSearchStr in the first place - confirm.
                var hourMatch = hourRegex.Match(htmlData.Substring(hourStart, hourEnd - hourStart));
                if (!hourMatch.Success)
                {
                    throw new Exception("Failed to find hour info!");
                }
                var hour = Convert.ToSByte(hourMatch.Groups[1].Value);

                //Take the substring holding all minutes of this hour, then parse each of them
                var minutesStart = htmlData.IndexOf(MinutesSearchStr, hourStart);
                if (minutesStart == -1)
                {
                    throw new Exception("Minutes not found for hour!");
                }

                var minutesEnd = htmlData.IndexOf(TdClosingTag, minutesStart);
                if (minutesEnd == -1)
                {
                    throw new Exception("Minutes cell border not found!");
                }

                ParseMinuteEntries(htmlData.Substring(minutesStart, minutesEnd - minutesStart), hour, minuteRegex, schedule);

                //all minutes for current hour found, skipping to next hour
                index = minutesEnd;
            }

            ParseLegendSection(htmlData, index, schedule);
            return(schedule);
        }

        /// <summary>
        /// Adds one schedule entry per closed span found in the minutes cell of a single hour.
        /// </summary>
        /// <param name="allMinutesStr">HTML fragment containing every minute span of the hour.</param>
        /// <param name="hour">Hour the minutes belong to.</param>
        /// <param name="minuteRegex">Regex whose group 3 is the minute value and group 2 the special-route marker.</param>
        /// <param name="schedule">Schedule that receives the entries.</param>
        private static void ParseMinuteEntries(string allMinutesStr, sbyte hour, Regex minuteRegex, Schedule schedule)
        {
            var position = 0;

            while (true)
            {
                var spanEnd = allMinutesStr.IndexOf(SpanClosingTag, position);
                if (spanEnd == -1)
                {
                    break;
                }

                //Fragment includes the closing tag itself
                var minuteStr = allMinutesStr.Substring(position, spanEnd - position + SpanClosingTag.Length);
                position = spanEnd + SpanClosingTag.Length;

                var minuteMatch = minuteRegex.Match(minuteStr);
                if (!minuteMatch.Success)
                {
                    throw new Exception("Failed to find minute info!");
                }

                var minute       = Convert.ToSByte(minuteMatch.Groups[3].Value);
                var specialRoute = minuteMatch.Groups[2].Value;
                if (specialRoute.Length > 1)
                {
                    var type = RouteTypeProvider.GetRouteType(specialRoute);
                    schedule.AddEntry(new ScheduleEntry(hour, minute, type)); //special route
                }
                else
                {
                    schedule.AddEntry(new ScheduleEntry(hour, minute)); //regular route
                }
            }
        }

        /// <summary>
        /// Reads the optional legend that follows the timetable and registers the
        /// destination text of every colored (special) route on the schedule.
        /// Does nothing when the legend header or its closing cell tag is missing.
        /// </summary>
        /// <param name="htmlData">Raw HTML of the schedule page.</param>
        /// <param name="index">Position in <paramref name="htmlData"/> to start searching from.</param>
        /// <param name="schedule">Schedule that receives the special routes.</param>
        private static void ParseLegendSection(string htmlData, int index, Schedule schedule)
        {
            //IndexOf reports "not found" as -1, not 0
            var headerIndex = htmlData.IndexOf(LegendHeaderStr, index);
            if (headerIndex == -1)
            {
                return;
            }

            var legendStart = headerIndex + LegendHeaderStr.Length;
            var legendEnd   = htmlData.IndexOf(TdClosingTag, legendStart);
            if (legendEnd == -1)
            {
                return;
            }

            var legendData = htmlData.Substring(legendStart, legendEnd - legendStart);

            //regex pattern with colors: <p class="helpfile"><b style="color: (\w+)">(.*)<\/b>(.*)<\/p>
            //should be multiple matches
            //group1: color name, group2: color name in russian (bold text), group3: non-bold text (description)
            var colorsRegex = new Regex(ColorsRegexPattern);

            foreach (Match match in colorsRegex.Matches(legendData))
            {
                GroupCollection groups         = match.Groups;
                var             type           = RouteTypeProvider.GetRouteType(groups[1].Value);
                var             destinationRaw = groups[3].Value; //TODO: change regexp, no need in russian name of color
                var             startIndex     = destinationRaw.IndexOf(From);

                //Keep the whole description when the From marker is absent instead of
                //failing on Substring(-1)
                var destination = startIndex == -1 ? destinationRaw : destinationRaw.Substring(startIndex);
                schedule.SetSpecialRoute(type, destination);
            }
        }
コード例 #3
0
ファイル: Schedule.cs プロジェクト: Snarkorel/mgt_parser
 /// <summary>
 /// Creates a schedule for the given schedule info, starting with an empty entry list.
 /// </summary>
 /// <param name="info">Value stored in <c>Info</c>.</param>
 public Schedule(ScheduleInfo info)
 {
     this.Entries = new List <ScheduleEntry>();
     this.Info    = info;
 }
コード例 #4
0
        /// <summary>
        /// Walks every transport type, route, day set, direction and stop, and
        /// enqueues one <c>ScheduleInfo</c> per stop into <c>_siQueue</c>.
        /// </summary>
        /// <param name="client">HTTP client passed to the list-fetching helpers.</param>
        /// <returns>A task that completes once every reachable stop has been enqueued.</returns>
        private static async Task GetLists(HttpClient client)
        {
            foreach (var type in TrType.TransportTypes)
            {
                VerbosePrint("Obtaining routes for " + type);
                var routes = await GetRoutesList(client, type);

                foreach (var route in routes)
                {
                    VerbosePrint("\tFound route: " + route);
                    var days = await GetDaysOfOperation(client, type, route);

                    if (days.Count == 0)
                    {
                        continue; //skip faulty routes without anything ("route", "streets", "stations")
                    }

                    foreach (var day in days)
                    {
                        VerbosePrint("\t\tWorks on " + day);
                        //Direction names is not necessary, using AB/BA instead for iterating
                        var directions = await GetDirections(client, type, route, day);

                        if (directions.Count == 0)
                        {
                            continue; //skip faulty routes without directions (just in case if they appear in Mosgortans schedules)
                        }

                        //Direction codes and fetched direction names are paired by position;
                        //stop at the shorter of the two so a route with fewer named
                        //directions does not index past the end of the list.
                        var directionCount = Math.Min(Direction.Directions.Length, directions.Count);
                        for (var j = 0; j < directionCount; j++)
                        {
                            var dirCode   = Direction.Directions[j];
                            var direction = directions[j];
                            VerbosePrint("\t\t\tFound direction: " + direction);
                            var stops = await GetStops(client, type, route, day, dirCode);

                            if (stops.Count == 0)
                            {
                                continue; //skip faulty routes without stops (this can occur when new routes are added to database, but without schedules)
                            }

                            for (var stopNum = 0; stopNum < stops.Count; stopNum++)
                            {
                                VerbosePrint("\t\t\t\tFound stop: " + stops[stopNum]);

                                try
                                {
                                    var scheduleInfo = new ScheduleInfo(type, route, day, dirCode, direction, stopNum, stops[stopNum]);
                                    lock (_siLock)
                                    {
                                        _siQueue.Enqueue(scheduleInfo);
                                    }

                                    //Throttle without blocking the thread that runs this async method
                                    await Task.Delay(_sleepTime);
                                }
                                catch (Exception ex) //If we got exception - log it, and skip faulty item
                                {
                                    Console.WriteLine("EXCEPTION OCCURED: " + ex.Message);
                                }
                            }
                        }
                    }
                }
            }
        }
コード例 #5
0
        /// <summary>
        /// Worker loop: takes <c>ScheduleInfo</c> items from <c>_siQueue</c>, downloads and
        /// parses each schedule, and enqueues one CSV row per schedule entry into
        /// <c>_outputQueue</c>. Exits once <c>_parseFinish</c> is set and the queue is empty,
        /// then increments <c>_abortedCnt</c> to signal completion.
        /// </summary>
        /// <remarks>
        /// The method stays <c>async void</c> to keep its signature unchanged. Because
        /// exceptions leaving an <c>async void</c> method cannot be observed by the caller,
        /// failures of individual schedules are logged and skipped, and the completion
        /// counter is incremented in a <c>finally</c> block.
        /// </remarks>
        /// <param name="clientParam">The <see cref="HttpClient"/> to use, passed as <c>object</c>.</param>
        private static async void ParseThread(object clientParam)
        {
            const string formatStr = "{0};{1};{2};{3};'{4}';{5};'{6}';{7};{8};{9};{10};'{11}'";

            try
            {
                var client = (HttpClient)clientParam;

                while (true)
                {
                    //Throttle without blocking the thread that runs the continuation
                    await Task.Delay(_sleepTime);

                    var scheduleInfo = default(ScheduleInfo);
                    var hasItem      = false;

                    lock (_siLock)
                    {
                        if (_siQueue.Count != 0)
                        {
                            scheduleInfo = _siQueue.Dequeue();
                            hasItem      = true;
                        }
                    }

                    if (!hasItem)
                    {
                        if (_parseFinish) //Thread finish condition
                        {
                            break;
                        }
                        continue;
                    }

                    try
                    {
                        var schedule = await GetSchedule(client, scheduleInfo);

                        if (schedule == null)
                        {
                            continue;
                        }

                        //These values are the same for every entry of one schedule
                        var si     = schedule.GetInfo();
                        var tType  = si.GetTransportTypeString();
                        var rName  = si.RouteName;
                        var ds     = si.DaysOfOperation.ToString();
                        var dc     = si.GetDirectionCodeString();
                        var dn     = si.DirectionName;
                        var snum   = si.StopNumber;
                        var sname  = si.StopName;
                        var valDat = schedule.ValidityTime.ToString("dd.MM.yyyy");

                        foreach (var entry in schedule.Entries)
                        {
                            var rType  = entry.RouteType;
                            var rDest  = schedule.GetSpecialRoute(rType);
                            var csvStr = string.Format(formatStr, tType, rName, ds, dc, dn, snum, sname, valDat, entry.Hour, entry.Minute, rType, rDest);

                            lock (_outLock)
                            {
                                _outputQueue.Enqueue(csvStr);
                            }
                        }
                    }
                    catch (Exception ex) //If we got exception - log it, and skip faulty item
                    {
                        Console.WriteLine("EXCEPTION OCCURED: " + ex.Message);
                    }
                }
            }
            finally
            {
                //Signal completion directly. Thread.Abort on the current thread is
                //not needed for this and is unsupported on .NET Core / .NET 5+.
                lock (_abortedLock)
                {
                    _abortedCnt++;
                }
            }
        }