Exemple #1
0
    /// <summary>
    /// Fetches the page for the request (unless it is the request already held)
    /// and selects the JSON nodes described by the parser data.
    /// </summary>
    /// <param name="page">The page request.</param>
    /// <returns>Number of nodes selected; 0 when selection fails or yields nothing.</returns>
    public int ParseUrl(HTTPRequest page)
    {
      // Only fetch again when the request differs from the one stored last time.
      if (_page != page)
      {
        _source = new HTMLPage(page).GetPage();
        _page = new HTTPRequest(page);
      }

      JSONNode root = JSONNode.LoadJSON(_source);
      try
      {
        _nodeList = root.GetNodes(_data.XPath, _data.ChannelFilter);
      }
      catch (Exception)
      {
        // Any failure while selecting nodes is reported as "nothing found".
        return 0;
      }

      return _nodeList != null ? _nodeList.Count : 0;
    }
    /// <summary>
    /// Checks that a URL string is split into host and GET query parts.
    /// </summary>
    public void Base()
    {
      const string url = "http://www.somewhere.com/path/path/file.html";
      HTTPRequest request = new HTTPRequest(url);

      Assert.IsTrue(request.Host == "www.somewhere.com");
      Assert.IsTrue(request.GetQuery == "/path/path/file.html");
    }
 /// <summary>
 /// Creates a request builder; both the day offset and the list/page offset start at zero.
 /// </summary>
 /// <param name="baseRequest">Request used as the template for generated requests.</param>
 /// <param name="startTime">Time the generated requests refer to.</param>
 /// <param name="data">Request settings read when building a request.</param>
 public RequestBuilder(HTTPRequest baseRequest, WorldDateTime startTime, RequestData data)
 {
   _offset = 0;
   _dayOffset = 0;

   _data = data;
   _requestTime = startTime;
   _baseRequest = baseRequest;
 }
    /// <summary>
    /// Builds a request from the base request by substituting every supported
    /// placeholder tag with values derived from the request time, the offsets
    /// and the request data.
    /// </summary>
    /// <returns>A copy of the base request with the tags replaced.</returns>
    public HTTPRequest GetRequest()
    {
      HTTPRequest request = new HTTPRequest(_baseRequest);
      CultureInfo culture = new CultureInfo(_data.SearchLang);

      if (_data.DayNames != null)
      {
        request.ReplaceTag("[DAY_NAME]", _data.DayNames[_dayOffset]);
      }

      if (_data.BaseDate != null)
      {
        DateTime basedt = DateTime.Parse(_data.BaseDate);
        request.ReplaceTag("[DAYS_SINCE]", _requestTime.DaysSince(basedt).ToString());
      }

      request.ReplaceTag("[ID]", _data.ChannelId);

      request.ReplaceTag("[DAY_OFFSET]", (_dayOffset + _data.OffsetStart).ToString());
      request.ReplaceTag("[EPOCH_TIME]", _requestTime.ToEpochTime().ToString());
      request.ReplaceTag("[EPOCH_DATE]", _requestTime.ToEpochDate().ToString());
      request.ReplaceTag("[DAYOFYEAR]", _requestTime.DateTime.DayOfYear.ToString());
      request.ReplaceTag("[YYYY]", _requestTime.Year.ToString());
      request.ReplaceTag("[MM]", String.Format("{0:00}", _requestTime.Month));
      request.ReplaceTag("[_M]", _requestTime.Month.ToString());
      request.ReplaceTag("[MONTH]", _requestTime.DateTime.ToString("MMMM", culture));
      request.ReplaceTag("[DD]", String.Format("{0:00}", _requestTime.Day));
      request.ReplaceTag("[_D]", _requestTime.Day.ToString());

      // This fix is needed for countries where the first day (0) is Sunday (not Monday);
      // those grabbers should include OffsetStart="1" in the Search tag.
      // C#'s % keeps the sign of the dividend, so the "+ 7) % 7" step folds any
      // negative sum back into 0..6, however large the negative offset is.
      int dayNum = ((((int)_requestTime.DateTime.DayOfWeek + _data.OffsetStart) % 7) + 7) % 7;
      request.ReplaceTag("[DAY_OF_WEEK]", dayNum.ToString());

      // Use the script-defined weekday name when one exists for this day,
      // otherwise format the date with the configured weekday pattern.
      if (_data.WeekDayNames != null && dayNum < _data.WeekDayNames.Length && _data.WeekDayNames[dayNum] != String.Empty)
      {
        request.ReplaceTag("[WEEKDAY]", _data.WeekDayNames[dayNum]);
      }
      else
      {
        request.ReplaceTag("[WEEKDAY]", _requestTime.DateTime.ToString(_data.WeekDay, culture));
      }

      request.ReplaceTag("[LIST_OFFSET]", ((_offset * _data.MaxListingCount) + _data.ListStart).ToString());
      request.ReplaceTag("[PAGE_OFFSET]", (_offset + _data.PageStart).ToString());

      return request;
    }
Exemple #5
0
 /// <summary>
 /// Copy constructor: takes over the fields listed below from another request.
 /// </summary>
 /// <param name="request">The request to copy from.</param>
 public HTTPRequest(HTTPRequest request)
 {
   // Address parts
   this._scheme = request._scheme;
   this._host = request._host;
   this._getQuery = request._getQuery;
   this._postQuery = request._postQuery;

   // Transfer options
   this._cookies = request._cookies;
   this._externalBrowser = request._externalBrowser;
   this._encoding = request._encoding;
   this._delay = request._delay;
 }
    /// <summary>
    /// Checks that ReplaceTag substitutes tags in both the GET query and the POST query.
    /// </summary>
    public void ReplaceTag()
    {
      HTTPRequest request = new HTTPRequest("http://www.somewhere.com/[PATH]/[FILE]");
      request.PostQuery = "[PATH]?[POSTDATA]";

      // [PATH] appears in the GET part and in the POST part.
      request.ReplaceTag("[PATH]", "tagpath");
      Assert.IsTrue(request.GetQuery == "/tagpath/[FILE]");
      Assert.IsTrue(request.PostQuery == "tagpath?[POSTDATA]");

      // [FILE] appears only in the GET part.
      request.ReplaceTag("[FILE]", "tagfile.html");
      Assert.IsTrue(request.GetQuery == "/tagpath/tagfile.html");

      // [POSTDATA] appears only in the POST part.
      request.ReplaceTag("[POSTDATA]", "tagpostdata");
      Assert.IsTrue(request.PostQuery == "tagpath?tagpostdata");
    }
    /// <summary>
    /// Checks that Add resolves relative paths, rooted paths, absolute URLs and
    /// query-only strings against a base request.
    /// </summary>
    public void Add()
    {
      HTTPRequest requestBase = new HTTPRequest("http://www.somewhere.com/path/path/file.html");

      // Relative path
      HTTPRequest request = requestBase.Add("../relpath/relfile.html");
      Assert.IsTrue(request.Url == "http://www.somewhere.com/path/relpath/relfile.html");

      // Path rooted at the host
      request = requestBase.Add("/newpath/newfile.html");
      Assert.IsTrue(request.Url == "http://www.somewhere.com/newpath/newfile.html");

      // Absolute URL on another host
      request = requestBase.Add("http://www.somewhere_else.com/path/file.html");
      Assert.IsTrue(request.Url == "http://www.somewhere_else.com/path/file.html");

      // Query string only
      requestBase = new HTTPRequest("http://spettacolo.alice.it/guidatv/cgi/index.cgi?tipo=3&channel=22");
      request = requestBase.Add("?tipo=1&qs=8353441");
      Assert.IsTrue(request.Url == "http://spettacolo.alice.it/guidatv/cgi/index.cgi?tipo=1&qs=8353441");
    }
Exemple #8
0
    /// <summary>
    /// Fetches the page for the request (unless it is the request already held),
    /// loads it as XML and selects the nodes described by the parser data.
    /// </summary>
    /// <param name="page">The page request.</param>
    /// <returns>Number of nodes selected; 0 when the XML cannot be loaded or nothing matches.</returns>
    public int ParseUrl(HTTPRequest page)
    {
      // Only fetch again when the request differs from the one stored last time.
      if (_page != page)
      {
        _source = new HTMLPage(page).GetPage();
        _page = new HTTPRequest(page);
      }

      XmlDocument document = new XmlDocument();
      try
      {
        document.LoadXml(_source);

        // When a channel attribute is configured, restrict the XPath to
        // elements whose attribute equals the current channel name.
        string query = _data.Channel != string.Empty
                         ? _data.XPath + "[@" + _data.Channel + "=\"" + _channelName + "\"]"
                         : _data.XPath;

        _nodeList = document.DocumentElement.SelectNodes(query);
      }
      catch (XmlException)
      {
        return 0;
      }

      return _nodeList != null ? _nodeList.Count : 0;
    }
Exemple #9
0
    /// <summary>
    /// Parses the URL and returns the number of instances of the template found on this site.
    /// Only the part of the page between the template's Start and End markers is searched;
    /// a marker that is not configured or not found falls back to the page start/end.
    /// </summary>
    /// <param name="site">The site.</param>
    /// <returns>The match count, or 0 when no page source is available.</returns>
    public int ParseUrl(HTTPRequest site)
    {
      HTMLPage webPage = new HTMLPage(site);
      string pageSource = webPage.GetPage();

      // Guard first: the marker searches below dereference the page source.
      if (pageSource == null)
      {
        return 0;
      }

      int startIndex = 0;
      if (!string.IsNullOrEmpty(_template.Start))
      {
        startIndex = pageSource.IndexOf(_template.Start, 0, StringComparison.OrdinalIgnoreCase);
        if (startIndex == -1)
        {
          // Start marker missing: search from the beginning of the page.
          startIndex = 0;
        }
      }

      int endIndex = pageSource.Length;
      if (!string.IsNullOrEmpty(_template.End))
      {
        // The end marker is searched from the start position, so endIndex >= startIndex.
        endIndex = pageSource.IndexOf(_template.End, startIndex, StringComparison.OrdinalIgnoreCase);
        if (endIndex == -1)
        {
          // End marker missing: search up to the end of the page.
          endIndex = pageSource.Length;
        }
      }

      // Run the replacement tasks before parsing the page source.
      return _profiler.MatchCount(Replacements(pageSource.Substring(startIndex, endIndex - startIndex)));
    }
    /// <summary>
    /// Checks the == and != operators of HTTPRequest, with and without POST data,
    /// and against null.
    /// </summary>
    public void EqualOperator()
    {
      HTTPRequest first = new HTTPRequest("http://www.somewhere.com/1/1");
      HTTPRequest other = new HTTPRequest("http://www.somewhere.com/2/2");
      HTTPRequest firstTwin = new HTTPRequest("http://www.somewhere.com/1/1");

      // Same URL compares equal, different URL does not.
      Assert.IsTrue(first == firstTwin);
      Assert.IsFalse(first == other);
      Assert.IsTrue(first != other);

      first.PostQuery = "post1";
      other.PostQuery = "post2";
      firstTwin.PostQuery = "post1";

      // The same holds once POST data is set.
      Assert.IsTrue(first == firstTwin);
      Assert.IsFalse(first == other);
      Assert.IsTrue(first != other);

      // Comparisons against null must work on both sides.
      HTTPRequest missing = null;

      Assert.IsTrue(missing == null);
      Assert.IsTrue(first != null);
    }
 /// <summary>
 /// Creates an <see cref="HTMLPage"/> and loads the requested page,
 /// using the encoding carried by the request.
 /// </summary>
 /// <param name="page">The page request.</param>
 public HTMLPage(HTTPRequest page) : this()
 {
     this._encoding = page.Encoding;
     this.LoadPage(page);
 }
 /// <summary>
 /// Creates an <see cref="HTMLPage"/> and loads the requested page,
 /// using the encoding supplied by the caller.
 /// </summary>
 /// <param name="page">The page request.</param>
 /// <param name="encoding">The encoding.</param>
 public HTMLPage(HTTPRequest page, string encoding) : this()
 {
     this._encoding = encoding;
     this.LoadPage(page);
 }
 /// <summary>
 /// Runs the HTTP transaction for the given request.
 /// </summary>
 /// <param name="request">The request.</param>
 /// <returns>The success flag reported by the transaction.</returns>
 public bool HTTPGet(HTTPRequest request)
 {
   bool succeeded = Transaction(request);
   return succeeded;
 }
 /// <summary>
 /// Creates an <see cref="HTTPTransaction"/> and immediately runs the transaction
 /// for the given request. The success flag of the transaction is not kept here.
 /// </summary>
 /// <param name="request">The request.</param>
 public HTTPTransaction(HTTPRequest request)
 {
   this.Transaction(request);
 }
    /// <summary>
    /// Performs the HTTP transaction for the specified page request and buffers
    /// the complete response body in <c>_data</c>.
    /// </summary>
    /// <remarks>
    /// Steps: optional delay, build the web request (proxy credentials, user agent,
    /// cookies), send it as GET or POST depending on whether a post query is set,
    /// follow a single 301/302 redirect manually, then read the response stream
    /// block by block. A <see cref="WebException"/> is stored in <c>_error</c> and
    /// reported as failure; other exception types are not caught here.
    /// </remarks>
    /// <param name="pageRequest">The page request.</param>
    /// <returns>bool - Success/fail</returns>
    private bool Transaction(HTTPRequest pageRequest)
    {
      ArrayList Blocks = new ArrayList();
      byte[] Block;
      byte[] readBlock;
      int size;
      int totalSize;
      DateTime startTime = DateTime.Now;

      // Honour the per-request delay before contacting the server.
      if (pageRequest.Delay > 0)
      {
        Thread.Sleep(pageRequest.Delay);
      }

      // The request's own user agent wins; otherwise use this transaction's default.
      string agent = string.IsNullOrEmpty(pageRequest.UserAgent)? _agent : pageRequest.UserAgent;

      Uri pageUri = pageRequest.Uri;
      try
      {
        // Make the Webrequest
        // Create the request header
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUri);
        try
        {
          // Use the current user in case an NTLM Proxy or similar is used.
          // request.Proxy = WebProxy.GetDefaultProxy();
          request.Proxy.Credentials = CredentialCache.DefaultCredentials;
        }
        catch (Exception) {} // best effort: failure to set proxy credentials is ignored
        request.UserAgent = agent;
        // Redirects are handled by hand further down (one hop only).
        request.AllowAutoRedirect = false;
        if (pageRequest.Cookies != string.Empty)
        {
          // Cookies come as "name=value;name=value"; entries without '=' are skipped.
          // NOTE(review): names/values are not trimmed and a value containing '=' is
          // cut at the second '=' — confirm this is acceptable for the sites used.
          string[] cookiesArray = pageRequest.Cookies.Split(new Char[] {';'});
          foreach (string cookie in cookiesArray)
          {
            string[] cookieParts = cookie.Split(new Char[] {'='});
            if (cookieParts.Length >= 2)
            {
              if (_cookies == null)
                _cookies = new CookieCollection();
              _cookies.Add(new Cookie(cookieParts[0], cookieParts[1], "/", request.RequestUri.Host));
            }
          }
        }

        if (pageRequest.PostQuery == string.Empty)
        {
          // GET request
          // Credentials are only attached on the GET path.
          if (_auth != null)
          {
            request.Credentials = _auth.Get(pageUri.Host);
          }
          request.CookieContainer = new CookieContainer();
          if (_cookies != null)
          {
            request.CookieContainer.Add(_cookies);
          }
        }
        else
        {
          // POST request
          request.ContentType = _postType;
          // NOTE(review): this is the character count of the post query, while the
          // StreamWriter below writes encoded bytes; the two differ for non-ASCII
          // post data — confirm post queries are ASCII only.
          request.ContentLength = pageRequest.PostQuery.Length;
          request.Method = "POST";

          request.CookieContainer = new CookieContainer();
          if (_cookies != null)
            request.CookieContainer.Add(_cookies);

          // Write post message
          // NOTE(review): the request stream and writer are flushed but never
          // closed/disposed in this method.
          try
          {
            Stream OutputStream = request.GetRequestStream();
            StreamWriter WriteStream = new StreamWriter(OutputStream);
            WriteStream.Write(pageRequest.PostQuery);
            WriteStream.Flush();
          }
          catch (WebException ex)
          {
            _error = ex.Message;
            return false;
          }
        }

        _response = (HttpWebResponse)request.GetResponse();

        // Check for redirection
        if ((_response.StatusCode == HttpStatusCode.Found) ||
            (_response.StatusCode == HttpStatusCode.Redirect) ||
            (_response.StatusCode == HttpStatusCode.Moved) ||
            (_response.StatusCode == HttpStatusCode.MovedPermanently))
        {
          // Resolve the Location header against the original URI and issue a
          // second request (always sent with the default method, no post data).
          // NOTE(review): the first response is replaced below without being closed.
          Uri uri = new Uri(pageUri, _response.Headers["Location"]);
          HttpWebRequest redirect = (HttpWebRequest)WebRequest.Create(uri);
          try
          {
            // Use the current user in case an NTLM Proxy or similar is used.
            // request.Proxy = WebProxy.GetDefaultProxy();
            redirect.Proxy.Credentials = CredentialCache.DefaultCredentials;
          }
          catch (Exception) {} // best effort, same as for the first request
          redirect.UserAgent = agent;
          redirect.AllowAutoRedirect = false;
          redirect.Referer = _response.ResponseUri.ToString();

          redirect.CookieContainer = new CookieContainer();
          if (_response.Headers["Set-Cookie"] != null)
          {
            // Only the first "name=value;" pair of the Set-Cookie header is
            // forwarded to the redirect target.
            string cookieStr = _response.Headers["Set-Cookie"];
            Regex cookieParser = new Regex("(?<name>[^=]+)=(?<value>[^;]+)(;)");
            Match result = cookieParser.Match(cookieStr);
            if (result.Success)
            {
              Cookie reply = new Cookie(result.Groups["name"].ToString(), result.Groups["value"].ToString());
              reply.Domain = uri.Host;
              redirect.CookieContainer.Add(reply);
            }
          }
          //redirect.ContentType = "text/html"; 
          _response = (HttpWebResponse)redirect.GetResponse();
        }

        // Remember the cookies held by the ORIGINAL request's container (not the
        // redirect's) so that later transactions on this instance reuse them.
        if (request.CookieContainer != null)
        {
          _response.Cookies = request.CookieContainer.GetCookies(request.RequestUri);
          _cookies = _response.Cookies;
        }

        Stream ReceiveStream = _response.GetResponseStream();

        // Read the body in blockSize chunks; each chunk is copied to an exactly
        // sized array so the blocks can be concatenated afterwards.
        Block = new byte[blockSize];
        totalSize = 0;

        while ((size = ReceiveStream.Read(Block, 0, blockSize)) > 0)
        {
          readBlock = new byte[size];
          Array.Copy(Block, readBlock, size);
          Blocks.Add(readBlock);
          totalSize += size;
        }

        ReceiveStream.Close();
        _response.Close();

        // Concatenate all received blocks into the final data buffer.
        int pos = 0;
        _data = new byte[totalSize];

        for (int i = 0; i < Blocks.Count; i++)
        {
          Block = (byte[])Blocks[i];
          Block.CopyTo(_data, pos);
          pos += Block.Length;
        }
      }
      catch (WebException ex)
      {
        // NOTE(review): stream and response are not closed on this path.
        _error = ex.Message;
        return false;
      }

      // Collect site statistics: one request, bytes received, elapsed time
      // (the elapsed time includes the initial delay).
      if (_stats != null)
      {
        DateTime endTime = DateTime.Now;
        TimeSpan duration = endTime - startTime;
        _stats.Add(pageUri.Host, 1, _data.Length, duration);
      }

      return true;
    }
Exemple #16
0
 /// <summary>
 /// Tries to load the page for a request from the cache.
 /// Nothing is loaded unless the cache mode is <c>Mode.Enabled</c>.
 /// </summary>
 /// <param name="page">The page request.</param>
 /// <returns>bool - true if the page was loaded from the cache</returns>
 public bool LoadPage(HTTPRequest page)
 {
   if (_cacheMode != Mode.Enabled)
   {
     return false;
   }

   return LoadCacheFile(GetCacheFileName(page));
 }
 /// <summary>
 /// Gets the page for a request flagged as "external".
 /// Currently this simply forwards to <c>GetInternal</c>.
 /// </summary>
 /// <param name="page">The page request.</param>
 /// <returns>true if successful</returns>
 private bool GetExternal(HTTPRequest page)
 {
     bool loaded = GetInternal(page);
     return loaded;
 }
Exemple #18
0
    /// <summary>
    /// Initializes a new instance of the <see cref="WebParser"/> class.
    /// Sets up the listing parser from the "default" template and, when the
    /// template defines sublinks, a sublink parser for the first sublink.
    /// </summary>
    /// <param name="webTemplate">The web template.</param>
    public WebParser(WebParserTemplate webTemplate)
    {
      // Store the template
      _template = webTemplate;

      // Get default template -> currently only a default is supported
      // In the future template per channel ID can be added.
      HtmlParserTemplate listingTemplate = _template.GetTemplate("default");
      _listingPreference = _template.GetPreference("default");

      // Create dictionary for month strings (month name -> 1-based month number)
      Dictionary<string, int> monthsDict = null;
      if (_template.months != null)
      {
        // template contains months list -> load into dictionary
        monthsDict = new Dictionary<string, int>();
        for (int i = 0; i < _template.months.Length; i++)
        {
          monthsDict.Add(_template.months[i], i + 1);
        }
      }

      // create new Html Parser using default template
      _listingParser = new HtmlParser(listingTemplate, typeof (ProgramData), monthsDict);

      // setup sublink parser if template and config exists
      _sublinkParser = null;
      if (_template.sublinks != null && _template.sublinks.Count > 0)
      {
        // Load first sublink template -> only one supported now
        // future support for multiple sublinks possible
        SublinkInfo sublink = _template.sublinks[0];
        HtmlParserTemplate sublinkTemplate = _template.GetTemplate(sublink.template);

        // The template may be missing; only touch it after the null check so a
        // sublink without a template leaves the sublink parser disabled instead
        // of throwing a NullReferenceException.
        if (sublinkTemplate != null)
        {
          _sublinkPreference = _template.GetPreference(sublinkTemplate.Name);

          // create sublink parser using template
          _sublinkParser = new HtmlParser(sublinkTemplate, typeof (ProgramData));
          _sublinkMatch = sublink.search;
          _sublinkRequest = sublink.url;
        }
      }
    }
Exemple #19
0
 /// <summary>
 /// Parses a site for a given URL with the listing parser.
 /// When a sublink parser exists but no sublink request has been set yet,
 /// a copy of the site request (with its delay reset to 0) becomes the sublink request.
 /// </summary>
 /// <param name="site">The request for the site to be parsed</param>
 /// <returns>number of instances of the template found</returns>
 public int ParseUrl(HTTPRequest site)
 {
   bool sublinkRequestMissing = _sublinkParser != null && _sublinkRequest == null;
   if (sublinkRequestMissing)
   {
     HTTPRequest sublinkBase = new HTTPRequest(site);
     sublinkBase.Delay = 0; // do not use delay for sublinks unless explicitly requested
     _sublinkRequest = sublinkBase;
   }

   int found = _listingParser.ParseUrl(site);
   return found;
 }
Exemple #20
0
    /// <summary>
    /// Deletes the cache file belonging to a page request, if such a file exists.
    /// </summary>
    /// <param name="page">The page request.</param>
    public void DeleteCachePage(HTTPRequest page)
    {
      string cacheFile = GetCacheFileName(page);

      if (!File.Exists(cacheFile))
      {
        return;
      }

      File.Delete(cacheFile);
    }
Exemple #21
0
    /// <summary>
    /// Builds the cache file name for a page request:
    /// <c>CACHE_DIR/host_urihash.html</c>, or <c>CACHE_DIR/host_urihash_posthash.html</c>
    /// when the request carries a POST query.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the name is derived from GetHashCode() values, which .NET does
    /// not guarantee to be stable across runtimes or processes — confirm that cache
    /// files are expected to be found again after a restart.
    /// </remarks>
    /// <param name="Page">The page.</param>
    /// <returns>filename</returns>
    private static string GetCacheFileName(HTTPRequest Page)
    {
      uint uriHash = (uint)Page.Uri.GetHashCode();
      string name = CACHE_DIR + "/" + Page.Host + "_" + uriHash.ToString();

      if (!string.IsNullOrEmpty(Page.PostQuery))
      {
        uint postHash = (uint)Page.PostQuery.GetHashCode();
        name = name + "_" + postHash.ToString();
      }

      return name + ".html";
    }
Exemple #22
0
 /// <summary>
 /// Saves a page to the cache unless the cache mode is <c>Mode.Disabled</c>.
 /// </summary>
 /// <param name="page">The page request the source belongs to.</param>
 /// <param name="strSource">The HTML source.</param>
 public void SavePage(HTTPRequest page, string strSource)
 {
   if (_cacheMode == Mode.Disabled)
   {
     return;
   }

   string cacheFile = GetCacheFileName(page);
   SaveCacheFile(cacheFile, strSource);
 }
        /// <summary>
        /// Gets the page using internal .NET code and decodes the received bytes
        /// into the page source.
        /// </summary>
        /// <remarks>
        /// The encoding is either the forced one (<c>_encoding</c> not empty) or is
        /// taken from a "charset" entry found before the closing head tag; otherwise
        /// the default encoding is used.
        /// </remarks>
        /// <param name="page">The page request.</param>
        /// <returns>true if successful</returns>
        private bool GetInternal(HTTPRequest page)
        {
            HTTPTransaction transaction = new HTTPTransaction();

            // Failure path first: remember and log the error.
            if (!transaction.HTTPGet(page))
            {
                _error = transaction.GetError();
                if (!string.IsNullOrEmpty(_error))
                {
                    Log.Error("HTMLPage: GetInternal error: {0}", _error);
                }
                return false;
            }

            byte[] rawBytes     = transaction.GetData();
            string encodingName = _defaultEncode;

            if (_encoding != "")
            {
                // An encoding was requested explicitly.
                encodingName         = _encoding;
                _pageEncodingMessage = "Forced: " + _encoding;
            }
            else
            {
                // Decode with the default encoding first so the header can be inspected.
                _strPageSource = System.Text.Encoding.GetEncoding(_defaultEncode).GetString(rawBytes);

                string lowered = _strPageSource.ToLower();
                int    headEnd = lowered.IndexOf("</head");
                if (headEnd != -1)
                {
                    int pos = lowered.IndexOf("charset", 0, headEnd);
                    if (pos != -1)
                    {
                        // Skip "charset" plus one more character, then collect
                        // everything up to the next double quote.
                        encodingName = "";
                        pos         += 8;
                        while (pos < _strPageSource.Length && _strPageSource[pos] != '\"')
                        {
                            encodingName += _strPageSource[pos];
                            pos++;
                        }
                        _encoding = encodingName;
                    }

                    if (encodingName == "")
                    {
                        encodingName         = _defaultEncode;
                        _pageEncodingMessage = "Default: " + _defaultEncode;
                    }
                    else
                    {
                        _pageEncodingMessage = encodingName;
                    }
                }
            }

            Log.Debug("HTMLPage: GetInternal encoding: {0}", _pageEncodingMessage);

            // Decode (again) when nothing has been decoded yet or when the selected
            // encoding differs from the default one.
            if (string.IsNullOrEmpty(_strPageSource) || encodingName.ToLower() != _defaultEncode)
            {
                try
                {
                    _strPageSource = System.Text.Encoding.GetEncoding(encodingName).GetString(rawBytes);
                }
                catch (System.ArgumentException e)
                {
                    // Unknown encoding name: keep whatever source was decoded before.
                    Log.Write(e);
                }
            }
            return true;
        }
Exemple #24
0
 /// <summary>
 /// Creates an XML parser for the given template. No page is held yet and
 /// parsed items are of type <c>ProgramData</c>.
 /// </summary>
 /// <param name="data">The XML parser template.</param>
 public XmlParser(XmlParserTemplate data)
 {
   _data = data;
   _dataType = typeof (ProgramData);
   _page = null;
 }
Exemple #25
0
 /// <summary>
 /// Creates a new request by resolving a relative or absolute URL against this request.
 /// A string starting with "?" is treated as a new query for the current path.
 /// The encoding of this request is carried over to the new one.
 /// </summary>
 /// <param name="relativeUri">Relative or absolute URL, or a query string starting with "?".</param>
 /// <returns>The new request.</returns>
 public HTTPRequest Add(string relativeUri)
 {
   string target = relativeUri;
   if (target.StartsWith("?"))
   {
     // Query only: keep the current path and replace the query part.
     target = Uri.LocalPath + target;
   }

   HTTPRequest combined = new HTTPRequest(new Uri(Uri, target));
   combined._encoding = _encoding;
   return combined;
 }
Exemple #26
0
    /// <summary>
    /// Loads the page: from the cache when it is available there, otherwise by
    /// fetching it (after the request's delay) and storing the result in the cache.
    /// </summary>
    /// <param name="page">The page request.</param>
    /// <returns>true if successful</returns>
    public bool LoadPage(HTTPRequest page)
    {
      // Cache hit: take the cached source and stop here.
      if (_cache != null && _cache.Initialised && _cache.LoadPage(page))
      {
        _strPageSource = _cache.GetPage();
        return true;
      }

      // Delay before getting page
      if (page.Delay > 0)
      {
        Thread.Sleep(page.Delay);
      }

      bool fetched = page.External ? GetExternal(page) : GetInternal(page);
      if (!fetched)
      {
        return false;
      }

      // Store the freshly fetched source for later use.
      if (_cache != null && _cache.Initialised)
      {
        _cache.SavePage(page, _strPageSource);
      }

      return true;
    }
        /// <summary>
        /// Gets the hyper link: searches the item at <paramref name="index"/> for a tag
        /// containing <paramref name="match"/>, extracts the link target from it and
        /// applies it to <paramref name="linkURL"/>.
        /// </summary>
        /// <remarks>
        /// <paramref name="linkURL"/> is updated whenever a matching tag is found, even
        /// when the return value is false (the value then comes from an empty link
        /// string or from the script parameters).
        /// NOTE(review): <paramref name="match"/> is inserted into the regular
        /// expression unescaped — confirm callers pass a regex fragment on purpose.
        /// </remarks>
        /// <param name="index">The index.</param>
        /// <param name="match">The match.</param>
        /// <param name="linkURL">The link URL; used as base/template and replaced or modified.</param>
        /// <returns>bool - true when a delimited href/onclick value was extracted</returns>
        public bool GetHyperLink(int index, string match, ref HTTPRequest linkURL)
        {
            // Matches a tag that starts with "<a " or contains "onclick" and that
            // also contains the match text somewhere before its closing '>'.
            string regex = "<(a |[^>]*onclick)[^>]*" + match + "[^>]*>"; //"<a .*? href=[^>]*" .ToLowerInvariant()

            // NOTE(review): meaning of the two boolean flags is defined by SearchRegex — not visible here.
            string result = SearchRegex(index, regex, true, false);

            if (result == null)
            {
                return(false);
            }

            bool   linkFound  = false;
            string strLinkURL = string.Empty;

            // start begins at -1 so that, after adding "attribute position + length",
            // it points at the last character of "href=" / "onclick=" and
            // start + 1 is the first character of the attribute value.
            int  start = -1;
            char delim = '>';

            if (result.ToLowerInvariant().IndexOf("href=") != -1)
            {
                start += result.ToLowerInvariant().IndexOf("href=") + 5;
            }
            // NOTE(review): when the tag has both href= and onclick=, both offsets are
            // added to start, which then no longer points at either attribute.
            if (result.ToLowerInvariant().IndexOf("onclick=") != -1)
            {
                start += result.ToLowerInvariant().IndexOf("onclick=") + 8;
            }
            // A quoted value ends at the matching quote; an unquoted one at '>'.
            if (result[start + 1] == '\"' || result[start + 1] == '\'')
            {
                start++;
                delim = result[start];
            }

            int end = -1;

            //if (delim != '>')
            //{
            //  start = -1;
            //  start = result.IndexOf(delim);
            //}
            if (start != -1)
            {
                end = result.IndexOf(delim, ++start);
            }

            if (end != -1)
            {
                strLinkURL = result.Substring(start, end - start);
                linkFound  = true;
            }

            // Second pass: when the extracted value itself contains '=', look for a
            // quoted string after it and, if one is found, use its content instead.
            if ((start = strLinkURL.IndexOf("=")) != -1)
            {
                for (int i = 0; i < strLinkURL.Length - start; i++)
                {
                    if (strLinkURL[start + i] == '\"' || strLinkURL[start + i] == '\'')
                    {
                        delim = strLinkURL[start + i];
                        start = start + i;
                        break;
                    }
                }

                end = -1;

                if (start != -1)
                {
                    end = strLinkURL.IndexOf(delim, ++start);
                }

                if (end != -1)
                {
                    strLinkURL = strLinkURL.Substring(start, end - start);
                }
            }

            // Script-style links: parameters are extracted from the whole tag.
            string[] param = GetJavaSubLinkParams(result); //strLinkURL);
            if (param != null)
            {
                if (!linkURL.HasTag("[1]"))
                {
                    // No numbered tags in the request: the first parameter is the link.
                    linkURL = linkURL.Add(HtmlString.ToAscii(param[0]));
                }
                else
                {
                    // Fill the numbered tags [1], [2], ... with the parameters.
                    for (int i = 0; i < param.Length; i++)
                    {
                        linkURL.ReplaceTag("[" + (i + 1).ToString() + "]", HtmlString.ToAscii(param[i]));
                    }
                }
            }
            else
            {
                // Plain link: resolve the extracted URL against the current request.
                linkURL = linkURL.Add(HtmlString.ToAscii(strLinkURL.Trim()));
            }
            //}

            return(linkFound);
        }
Exemple #28
0
 /// <summary>
 /// Loads the page for the request and counts the rows found in its source.
 /// </summary>
 /// <param name="site">The request for the site to be parsed.</param>
 /// <returns>The row count reported for the page source.</returns>
 public int ParseUrl(HTTPRequest site)
 {
   string pageSource = new HTMLPage(site).GetPage();
   int rowCount = _rows.RowCount(pageSource);
   return rowCount;
 }
Exemple #29
0
 /// <summary>
 /// Gets the page for a request flagged as "external".
 /// Currently this simply forwards to <c>GetInternal</c>.
 /// </summary>
 /// <param name="page">The page request.</param>
 /// <returns>true if successful</returns>
 private bool GetExternal(HTTPRequest page)
 {
   bool loaded = GetInternal(page);
   return loaded;
 }
Exemple #30
0
    /// <summary>
    /// Gets the parsed data for the item at the given index, merged with the results
    /// of the template's search requests and, when a sublink parser exists, with the
    /// request for the matching sublink.
    /// </summary>
    /// <param name="index">The index.</param>
    /// <returns>The program data, or null when the listing parser has no data for the index.</returns>
    public IParserData GetData(int index)
    {
      // Run the search (and optional remove) requests first. A search looks at the
      // whole item and may remove the matched text so that it is not parsed again
      // together with the main data.
      ProgramData searchData = null;
      bool hasSearches = _template.searchList != null && _template.searchList.Count > 0;
      if (hasSearches)
      {
        searchData = new ProgramData();
        for (int s = 0; s < _template.searchList.Count; s++)
        {
          WebSearchData search = _template.searchList[s];
          string found = _listingParser.SearchRegex(index, search.Match, search.Remove);
          if (found != null)
          {
            searchData.SetElement(search.Field, found);
          }
        }
      }

      // Now parse the item itself.
      ProgramData data = ((ProgramData)_listingParser.GetData(index));
      if (data == null)
      {
        return data;
      }

      // The preference matters when data is merged later (e.g. with sublink data).
      data.Preference = new DataPreference(_listingPreference);

      if (searchData != null)
      {
        data.Merge(searchData);
      }

      // With a sublink parser, look for a matching sublink. The sublink page is not
      // parsed here because it may never be needed; only its request is built and
      // stored for later use (see GetLinkedData()).
      // No minimum delay is applied to the sublink request: it was never effective
      // (always 0) and enabling it has a serious impact on grab speed.
      if (_sublinkParser != null)
      {
        HTTPRequest sublinkRequest = new HTTPRequest(_sublinkRequest);
        bool linkFound = _listingParser.GetHyperLink(index, _sublinkMatch, ref sublinkRequest);
        if (linkFound)
        {
          data.SublinkRequest = sublinkRequest;
        }
      }

      return data;
    }
Exemple #31
0
    /// <summary>
    /// Gets the page using internal .NET code and decodes it into the page source.
    /// </summary>
    /// <remarks>
    /// Encoding selection:
    /// 1. If an encoding is already set on this instance (forced, or cached from an
    ///    earlier detection) it is used as-is.
    /// 2. Otherwise the data is decoded with the default encoding and the head section
    ///    is searched for a charset declaration; a detected charset is cached in
    ///    <c>_encoding</c> for subsequent requests.
    /// 3. If the chosen encoding name is unknown the error is logged and the default
    ///    encoding is used instead.
    /// The downloaded bytes are always decoded on every successful call, so the page
    /// source never keeps the content of a previous request.
    /// </remarks>
    /// <param name="page">The page request.</param>
    /// <returns>true if successful; false if the HTTP request failed (the error is stored and logged)</returns>
    private bool GetInternal(HTTPRequest page)
    {
      using (HTTPTransaction transaction = new HTTPTransaction())
      {
        if (!transaction.HTTPGet(page))
        {
          _error = transaction.GetError();
          if (!string.IsNullOrEmpty(_error))
          {
            GlobalServiceProvider.Get<ILog>().Error("HTMLPage: GetInternal error: {0}", _error);
          }
          return false;
        }

        byte[] pageData = transaction.GetData();
        string strEncode = _defaultEncode;

        // Set once the bytes of THIS response have been decoded into _strPageSource.
        // Checking _strPageSource for emptiness is not enough: it may still hold the
        // text of a previously loaded page.
        bool decoded = false;

        if (!string.IsNullOrEmpty(_encoding))
        {
          strEncode = _encoding;
          _pageEncodingMessage = "Forced: " + _encoding;
        }
        else
        {
          // Decode with the default encoding first so the head can be inspected
          _strPageSource = System.Text.Encoding.GetEncoding(_defaultEncode).GetString(pageData);
          decoded = true;

          string detected = FindCharsetInHead(_strPageSource);
          if (detected.Length == 0)
          {
            _pageEncodingMessage = "Default: " + _defaultEncode;
          }
          else
          {
            strEncode = detected;
            _encoding = detected; // remembered for following requests
            _pageEncodingMessage = detected;
          }
        }

        GlobalServiceProvider.Get<ILog>().Debug("HTMLPage: GetInternal encoding: {0}", _pageEncodingMessage);

        // Encoding: depends on selected page. Re-decode unless the current data has
        // already been decoded with exactly this encoding.
        bool sameAsDefault = string.Equals(strEncode, _defaultEncode, System.StringComparison.OrdinalIgnoreCase);
        if (!decoded || !sameAsDefault)
        {
          try
          {
            _strPageSource = System.Text.Encoding.GetEncoding(strEncode).GetString(pageData);
          }
          catch (System.ArgumentException e)
          {
            GlobalServiceProvider.Get<ILog>().Error(e);
            if (!decoded)
            {
              // Unknown encoding name: fall back to the default rather than
              // leaving the source of an earlier page in place.
              _strPageSource = System.Text.Encoding.GetEncoding(_defaultEncode).GetString(pageData);
            }
          }
        }
        return true;
      }
    }

    /// <summary>
    /// Looks for a charset declaration before the closing head tag of an HTML document.
    /// Handles <c>charset=name"</c> (http-equiv content attribute) as well as
    /// <c>charset="name"</c>, <c>charset='name'</c> and unquoted values.
    /// </summary>
    /// <param name="html">The page text, decoded with any ASCII-compatible encoding.</param>
    /// <returns>The declared charset name, or an empty string if none is found.</returns>
    private static string FindCharsetInHead(string html)
    {
      if (string.IsNullOrEmpty(html))
      {
        return string.Empty;
      }

      int headEnd = html.IndexOf("</head", System.StringComparison.OrdinalIgnoreCase);
      if (headEnd == -1)
      {
        return string.Empty;
      }

      int searchFrom = 0;
      while (searchFrom < headEnd)
      {
        int pos = html.IndexOf("charset", searchFrom, headEnd - searchFrom,
                               System.StringComparison.OrdinalIgnoreCase);
        if (pos == -1)
        {
          break;
        }

        pos += 7; // length of "charset"
        searchFrom = pos;

        while (pos < headEnd && char.IsWhiteSpace(html[pos]))
        {
          pos++;
        }
        if (pos >= headEnd || html[pos] != '=')
        {
          // The word was not part of an assignment (eg plain text); try the next occurrence
          continue;
        }
        pos++;

        while (pos < headEnd && char.IsWhiteSpace(html[pos]))
        {
          pos++;
        }
        if (pos < headEnd && (html[pos] == '\"' || html[pos] == '\''))
        {
          pos++;
        }

        int valueStart = pos;
        while (pos < headEnd)
        {
          char c = html[pos];
          if (c == '\"' || c == '\'' || c == ';' || c == '>' || c == '/' || char.IsWhiteSpace(c))
          {
            break;
          }
          pos++;
        }

        if (pos > valueStart)
        {
          return html.Substring(valueStart, pos - valueStart);
        }
      }

      return string.Empty;
    }
    /// <summary>
    /// Gets the hyper link of the listing entry at <paramref name="index"/> and applies it
    /// to <paramref name="linkURL"/>.
    /// </summary>
    /// <remarks>
    /// The first tag that is either an anchor or carries an onclick attribute and that
    /// contains <paramref name="match"/> is located. The link text is taken from the
    /// tag's href attribute; the onclick attribute is used only when there is no href.
    /// (Previously the offsets of both attributes were added together when a tag had
    /// both, producing an invalid position.) If the tag contains javascript parameters
    /// they are applied to <paramref name="linkURL"/> instead of the link text.
    /// </remarks>
    /// <param name="index">The index.</param>
    /// <param name="match">The match; inserted verbatim into a regular expression.</param>
    /// <param name="linkURL">The link URL. Modified (or replaced) with the link details.</param>
    /// <returns>bool - success/fail: true when a delimited link value was found in the tag</returns>
    public bool GetHyperLink(int index, string match, ref HTTPRequest linkURL)
    {
      string regex = "<(a |[^>]*onclick)[^>]*" + match + "[^>]*>"; //"<a .*? href=[^>]*" .ToLowerInvariant()

      string result = SearchRegex(index, regex, true, false);

      if (result == null)
      {
        return false;
      }

      bool linkFound = false;
      string strLinkURL = string.Empty;

      // 'start' is kept one position before the first character of the attribute
      // value (ie on the '=' sign); -1 means no link attribute was found.
      int start = -1;
      char delim = '>';

      int attrPos = result.IndexOf("href=", System.StringComparison.OrdinalIgnoreCase);
      if (attrPos != -1)
      {
        start = attrPos + 4;
      }
      else
      {
        attrPos = result.IndexOf("onclick=", System.StringComparison.OrdinalIgnoreCase);
        if (attrPos != -1)
        {
          start = attrPos + 7;
        }
      }

      // A quoted value ends at the matching quote, an unquoted one at the end of the tag
      if (start != -1 && start + 1 < result.Length &&
          (result[start + 1] == '\"' || result[start + 1] == '\''))
      {
        start++;
        delim = result[start];
      }

      int end = -1;
      if (start != -1)
      {
        end = result.IndexOf(delim, ++start);
      }

      if (end != -1)
      {
        strLinkURL = result.Substring(start, end - start);
        linkFound = true;
      }

      // The value may itself be an assignment with a quoted target
      // (eg location.href='page.html'); if so, narrow down to the quoted part.
      if ((start = strLinkURL.IndexOf("=")) != -1)
      {
        for (int i = 0; i < strLinkURL.Length - start; i++)
        {
          if (strLinkURL[start + i] == '\"' || strLinkURL[start + i] == '\'')
          {
            delim = strLinkURL[start + i];
            start = start + i;
            break;
          }
        }

        end = -1;

        if (start != -1)
        {
          end = strLinkURL.IndexOf(delim, ++start);
        }

        if (end != -1)
        {
          strLinkURL = strLinkURL.Substring(start, end - start);
        }
      }

      string[] param = GetJavaSubLinkParams(result); //strLinkURL);
      if (param != null)
      {
        if (!linkURL.HasTag("[1]"))
        {
          // No numbered placeholders in the request: append the first parameter
          linkURL = linkURL.Add(HtmlString.ToAscii(param[0]));
        }
        else
        {
          // Fill the placeholders [1], [2], ... with the parameters in order
          for (int i = 0; i < param.Length; i++)
          {
            linkURL.ReplaceTag("[" + (i + 1).ToString() + "]", HtmlString.ToAscii(param[i]));
          }
        }
      }
      else
      {
        linkURL = linkURL.Add(HtmlString.ToAscii(strLinkURL.Trim()));
      }

      return linkFound;
    }
Exemple #33
0
 /// <summary>
 /// Initializes a new instance of the <see cref="HTMLPage"/> class and loads the
 /// requested page, taking the encoding from the request itself.
 /// </summary>
 /// <param name="page">The page request; its Encoding value is copied to this instance before the page is loaded.</param>
 public HTMLPage(HTTPRequest page) : this()
 {
   // Keep this order: the encoding is stored before LoadPage runs
   // (presumably the load uses it when decoding - confirm against LoadPage).
   _encoding = page.Encoding;
   LoadPage(page);
 }
Exemple #34
0
 /// <summary>
 /// Initializes a new instance of the <see cref="HTMLPage"/> class and loads the
 /// requested page, using an explicitly supplied encoding.
 /// </summary>
 /// <param name="page">The page request.</param>
 /// <param name="encoding">The encoding stored on this instance before the page is loaded.</param>
 public HTMLPage(HTTPRequest page, string encoding) : this()
 {
   // Keep this order: the encoding is stored before LoadPage runs
   // (presumably the load uses it when decoding - confirm against LoadPage).
   _encoding = encoding;
   LoadPage(page);
 }
    /// <summary>
    /// Gets the page using external com browser IE.
    /// </summary>
    /// <remarks>
    /// The browser instance is created on first use and reused afterwards. The request
    /// is sent as a POST when the page request carries post data, otherwise as a GET.
    /// The call blocks, polling every 500 ms, until the browser reports it is no longer
    /// busy; there is no timeout.
    /// </remarks>
    /// <param name="page">The page request.</param>
    /// <returns>true if successful; false when the browser produced no document body
    /// (the page source is then set to an empty string)</returns>
    private bool GetExternal(HTTPRequest page)
    {
      // Use External Browser (IE) to get HTML page
      // IE downloads all linked graphics ads, etc
      // IE will run Javascript source if required to render the page
      if (_IE == null)
      {
        _IE = new InternetExplorer();
      }

      IWebBrowser2 webBrowser = (IWebBrowser2)_IE;

      object empty = Missing.Value;

      // check if request is POST or GET
      if (page.PostQuery != null)
      {
        ASCIIEncoding encoding = new ASCIIEncoding();
        object postData = (object)encoding.GetBytes(page.PostQuery);
        // HTTP header lines are terminated by CRLF ("\r\n"); this used to be the
        // reversed sequence "\n\r".
        object header = (object)"Content-Type: application/x-www-form-urlencoded\r\n";
        webBrowser.Navigate(page.Url, ref empty, ref empty, ref postData, ref header);
      }
      else
      {
        webBrowser.Navigate(page.Url, ref empty, ref empty, ref empty, ref empty);
      }

      // Wait for the browser to finish loading
      while (webBrowser.Busy == true)
      {
        Thread.Sleep(500);
      }

      HTMLDocumentClass doc = (HTMLDocumentClass)webBrowser.Document;

      // Without a document or body there is nothing to read; report failure
      // instead of failing with a NullReferenceException.
      if (doc == null || doc.body == null)
      {
        _strPageSource = string.Empty;
        return false;
      }

      _strPageSource = doc.body.innerHTML;

      return true;
    }