Example #1
0
        /// <summary>
        /// Checks if a Url belongs to the part of the web we wish to crawl.
        /// </summary>
        /// <param name="targetUrl">
        /// The url to examine. It is passed by reference but is never reassigned by this method.
        /// </param>
        /// <returns>
        /// <see cref="DomainFlagValue.MustVisit"/> if the host name of the Url ends with
        /// ".gr", contains "ebay.com", or is an IP address for which the IP table lookup
        /// returns the country code "GR"; otherwise <see cref="DomainFlagValue.Unknown"/>.
        /// </returns>
        /// <remarks>
        /// Since it is possible for a url that belongs to a non-greek domain (e.g. .com) to
        /// contain greek content, all Urls that are not recognised by the checks above get
        /// the value of <see cref="DomainFlagValue.Unknown"/>. This allows the system to assign
        /// them at a later time to a client who will visit them and check their content-type
        /// encoding, in order to determine whether they are of interest to the system.
        /// Any exception raised while filtering is logged as a warning and results in
        /// <see cref="DomainFlagValue.Unknown"/> being returned.
        /// </remarks>
        public DomainFlagValue FilterUrl(ref string targetUrl)
        {
            DomainFlagValue retVal = DomainFlagValue.Unknown;

            // Remembers whether this thread really owns the mutex, so that the finally
            // block never calls ReleaseMutex() after a failed wait (which would throw
            // from the finally block and escape the logging handler below).
            bool ownsMutex = false;

            try
            {
                try
                {
                    ownsMutex = mutex.WaitOne();
                }
                catch (System.Threading.AbandonedMutexException)
                {
                    // An abandoned mutex is still handed over to the waiting thread, so it
                    // must be released; the exception itself is logged by the outer handler.
                    ownsMutex = true;
                    throw;
                }

                string targetHost = InternetUtils.HostName(targetUrl);

                // Host names are not linguistic text, so compare the suffix ordinally
                // instead of using the current culture's comparison rules.
                if (targetHost.EndsWith(".gr", StringComparison.Ordinal) || targetHost.Contains("ebay.com"))
                {
                    retVal = DomainFlagValue.MustVisit;
                }
                else if (IsIPAddress(ref targetHost) && ipTable.GetCountry(targetHost) == "GR")
                {
                    // The host is a raw IP address that the IP table maps to "GR".
                    retVal = DomainFlagValue.MustVisit;
                }
            }
            catch (Exception e)
            {
                globals.FileLog.LogWarning(e.ToString());
            }
            finally
            {
                if (ownsMutex)
                {
                    mutex.ReleaseMutex();
                }
            }
            return retVal;
        }
Example #2
0
 /// <summary>
 /// Copy constructor: creates a new <see cref="InternetUrlToIndex"/> object whose values
 /// are taken from an existing <see cref="InternetUrlToIndex"/> object.
 /// </summary>
 /// <param name="IUrl">The <see cref="InternetUrlToIndex"/> object to copy from</param>
 /// <remarks>
 /// The ID, Url, FlagRobots, FlagDomain and Priority values of the source are copied.
 /// </remarks>
 public InternetUrlToIndex(InternetUrlToIndex IUrl)
 {
     this.ID  = IUrl.ID;
     this.Url = IUrl.Url;

     // NOTE(review): the MD5 value is not copied explicitly; presumably it is
     // derived when Url is assigned - confirm against the Url setter.
     this.priority   = IUrl.Priority;
     this.flagDomain = IUrl.FlagDomain;
     this.flagRobots = IUrl.FlagRobots;
 }
Example #3
0
 /// <summary>
 /// Copy constructor: creates a new <see cref="InternetUrlToCrawl"/> object whose values
 /// are taken from an existing <see cref="InternetUrlToCrawl"/> object.
 /// </summary>
 /// <param name="IUrl">The <see cref="InternetUrlToCrawl"/> object to copy from</param>
 /// <remarks>
 /// The ID, Url, CRC, FlagDomain, FlagFetchRobots and RobotsDisallowedPaths values of
 /// the source are copied.
 /// </remarks>
 public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
 {
     this.ID  = IUrl.ID;
     this.Url = IUrl.Url;

     // NOTE(review): the MD5 value is not copied explicitly; presumably it is
     // derived when Url is assigned - confirm against the Url setter.
     this.robotsDisallowedPaths = IUrl.RobotsDisallowedPaths;
     this.flagFetchRobots       = IUrl.FlagFetchRobots;
     this.flagDomain            = IUrl.FlagDomain;
     this.crc                   = IUrl.CRC;
 }
Example #4
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToCrawl"/> instance initialized with default values.
 /// </summary>
 /// <remarks>
 /// The ID and CRC are set to zero, the domain flag to
 /// <see cref="DomainFlagValue.Unknown"/>, the fetch-robots flag to false and the
 /// disallowed robots paths to an empty string. The Url and MD5 values are not
 /// assigned by this constructor.
 /// </remarks>
 public InternetUrlToCrawl()
 {
     this.ID = 0;

     // Url and MD5 are intentionally left untouched here.
     this.robotsDisallowedPaths = string.Empty;
     this.flagFetchRobots       = false;
     this.flagDomain            = DomainFlagValue.Unknown;
     this.crc                   = 0L;
 }
Example #5
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToIndex"/> instance initialized with default values.
 /// </summary>
 /// <remarks>
 /// The ID is set to zero, the Url to an empty string and the MD5 to sixteen zero
 /// bytes. The robots flag is false, the domain flag is
 /// <see cref="DomainFlagValue.MustVisit"/> and the priority is 255.
 /// </remarks>
 public InternetUrlToIndex()
 {
     this.ID  = 0;
     this.Url = string.Empty;

     // A freshly allocated array is already zero-filled, giving an all-zero 16 byte hash.
     this.MD5 = new byte[16];

     this.priority   = byte.MaxValue;
     this.flagDomain = DomainFlagValue.MustVisit;
     this.flagRobots = false;
 }
Example #6
0
 /// <summary>
 /// Creates a <see cref="UrlCrawlData"/> instance and fills every field with its
 /// default value.
 /// </summary>
 /// <remarks>
 /// The url is a default-constructed <see cref="InternetUrlToCrawl"/>, the time stamp
 /// is the current UTC time and the list of out links is left null.
 /// </remarks>
 public UrlCrawlData()
 {
     // The url being described and the state of the crawl.
     this.url        = new InternetUrlToCrawl();
     this.updated    = false;
     this.redirected = false;

     // Values that apply to the redirection target.
     this.redirectedFlagRobots = false;
     this.redirectedFlagDomain = DomainFlagValue.MustVisit;
     this.redirectedPriority   = 255;

     // Result of the retrieval.
     this.httpStatusCode = HttpStatusCode.OK;
     this.data           = string.Empty;
     this.timeStamp      = DateTime.UtcNow;
     this.retrievalTime  = 0;

     // No list is allocated up front; out links stay null until assigned.
     this.outLinks = null;
 }
Example #7
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToIndex"/> instance for a given Url and
 /// identifier, with every value of the new instance supplied by the caller.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
 /// <param name="url">The Url that this object encapsulates</param>
 /// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToIndex"/> object</param>
 /// <param name="flagRobots">A boolean flag indicating whether it's allowed to visit the Url</param>
 /// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the Url belongs to the domain we wish to visit</param>
 /// <param name="priority">A <see cref="System.Byte"/> flag indicating the url's crawling priority</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Url.
 /// </exception>
 public InternetUrlToIndex(int ID, string url, byte[] md5, bool flagRobots, DomainFlagValue flagDomain, byte priority)
 {
     // The Url is assigned first because this assignment may throw.
     // NOTE(review): validation presumably happens when Url is assigned - confirm.
     this.Url = url;
     this.ID  = ID;

     // The supplied hash is assigned after the Url.
     this.MD5 = md5;

     this.priority   = priority;
     this.flagDomain = flagDomain;
     this.flagRobots = flagRobots;
 }
Example #8
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToIndex"/> instance for a given Url and
 /// identifier. It is assumed that visiting the Url is allowed (the
 /// <see cref="FlagRobots"/> property is set to false) and that the Url belongs to
 /// the domain we wish to visit.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
 /// <param name="url">The Url that this object encapsulates</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Url.
 /// </exception>
 public InternetUrlToIndex(int ID, string url)
 {
     // The Url is assigned first because this assignment may throw.
     // NOTE(review): validation and the MD5 value are presumably handled when Url
     // is assigned - confirm.
     this.Url = url;
     this.ID  = ID;

     // Defaults for the values the caller did not supply.
     this.priority   = byte.MaxValue;
     this.flagDomain = DomainFlagValue.MustVisit;
     this.flagRobots = false;
 }
Example #9
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToCrawl"/> instance for a given Url and
 /// identifier, with every value of the new instance supplied by the caller.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
 /// <param name="url">The Url that this object encapsulates.</param>
 /// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToCrawl"/> object.</param>
 /// <param name="crc">The Cyclic Redundancy Check value associated with the <see cref="InternetUrlToCrawl"/> object.</param>
 /// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the crawler must check the country of origin of the contents of the Url.</param>
 /// <param name="flagFetchRobots">A boolean flag indicating whether a new copy of the robots.txt file related with this Url must be downloaded.</param>
 /// <param name="robotsDisallowedPaths">A string containing the space-delimited paths on a host that the crawler must not visit.</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Uri.
 /// </exception>
 public InternetUrlToCrawl(int ID, string url, byte[] md5, long crc, DomainFlagValue flagDomain, bool flagFetchRobots, string robotsDisallowedPaths)
 {
     // The Url is assigned first because this assignment may throw.
     // NOTE(review): validation presumably happens when Url is assigned - confirm.
     this.Url = url;
     this.ID  = ID;

     // The supplied hash is assigned after the Url.
     this.MD5 = md5;

     this.robotsDisallowedPaths = robotsDisallowedPaths;
     this.flagFetchRobots       = flagFetchRobots;
     this.flagDomain            = flagDomain;
     this.crc                   = crc;
 }
Example #10
0
 /// <summary>
 /// Creates an <see cref="InternetUrlToCrawl"/> instance for a given Url and
 /// identifier, using default values for everything else.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
 /// <param name="url">The Url that this object encapsulates.</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Uri.
 /// </exception>
 /// <remarks>
 /// The CRC is set to zero, the domain flag to <see cref="DomainFlagValue.Unknown"/>,
 /// the fetch-robots flag to false and the disallowed robots paths to an empty string.
 /// </remarks>
 public InternetUrlToCrawl(int ID, string url)
 {
     // The Url is assigned first because this assignment may throw.
     // NOTE(review): validation and the MD5 value are presumably handled when Url
     // is assigned - confirm.
     this.Url = url;
     this.ID  = ID;

     // Defaults for the values the caller did not supply.
     this.robotsDisallowedPaths = string.Empty;
     this.flagFetchRobots       = false;
     this.flagDomain            = DomainFlagValue.Unknown;
     this.crc                   = 0L;
 }
Example #11
0
        /// <summary>
        /// Attempts to extract the appropriate FlagDomain value from the contents of the document.
        /// </summary>
        /// <param name="content">
        /// The HTML content that must be parsed for Domain Flag value. It is passed by
        /// reference but is never reassigned by this method.
        /// </param>
        /// <returns>
        /// <see cref="DomainFlagValue.MustVisit"/> if the content contains at least one of
        /// the characters in the greek character table,
        /// <see cref="DomainFlagValue.MustNotVisit"/> if it contains none of them, and
        /// <see cref="DomainFlagValue.Unknown"/> if the content or the character table is
        /// null and no decision can be made.
        /// </returns>
        private DomainFlagValue ExtractDomainFlag(ref string content)
        {
            // Without content or a character table the search cannot run. Report
            // Unknown explicitly instead of relying on a swallowed exception.
            if (content == null || greekChars == null)
            {
                return DomainFlagValue.Unknown;
            }

            // Finding any single greek character is taken to mean the content is in greek.
            bool containsGreek = content.IndexOfAny(greekChars) != -1;
            return containsGreek ? DomainFlagValue.MustVisit : DomainFlagValue.MustNotVisit;
        }
Example #12
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToIndex"/> for the supplied Url and identifier.
 /// Every value of the new instance is provided by the caller.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
 /// <param name="url">The Url that this object encapsulates</param>
 /// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToIndex"/> object</param>
 /// <param name="flagRobots">A boolean flag indicating whether it's allowed to visit the Url</param>
 /// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the Url belongs to the domain we wish to visit</param>
 /// <param name="priority">A <see cref="System.Byte"/> flag indicating the url's crawling priority</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Url.
 /// </exception>
 public InternetUrlToIndex(int ID, string url, byte[] md5, bool flagRobots, DomainFlagValue flagDomain, byte priority)
 {
     // Assigning the Url may throw, so it happens before anything else is stored.
     // NOTE(review): validation presumably happens when Url is assigned - confirm.
     this.Url = url;
     this.ID = ID;

     // The caller-supplied hash is stored after the Url.
     this.MD5 = md5;

     this.priority = priority;
     this.flagDomain = flagDomain;
     this.flagRobots = flagRobots;
 }
Example #13
0
 /// <summary>
 /// Copy constructor: builds an <see cref="InternetUrlToIndex"/> from the values of
 /// an existing <see cref="InternetUrlToIndex"/> object.
 /// </summary>
 /// <param name="IUrl">The <see cref="InternetUrlToIndex"/> object to copy from</param>
 /// <remarks>
 /// The ID, Url, FlagRobots, FlagDomain and Priority values of the source are copied.
 /// </remarks>
 public InternetUrlToIndex(InternetUrlToIndex IUrl)
 {
     this.ID = IUrl.ID;
     this.Url = IUrl.Url;

     // NOTE(review): the MD5 value is not copied explicitly; presumably it is
     // derived when Url is assigned - confirm against the Url setter.
     this.priority = IUrl.Priority;
     this.flagDomain = IUrl.FlagDomain;
     this.flagRobots = IUrl.FlagRobots;
 }
Example #14
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToIndex"/> whose values are all defaults.
 /// </summary>
 /// <remarks>
 /// The ID is zero, the Url is an empty string and the MD5 is sixteen zero bytes.
 /// The robots flag is false, the domain flag is
 /// <see cref="DomainFlagValue.MustVisit"/> and the priority is 255.
 /// </remarks>
 public InternetUrlToIndex()
 {
     this.ID = 0;
     this.Url = string.Empty;

     // A newly allocated byte array is zero-filled, i.e. an all-zero 16 byte hash.
     this.MD5 = new byte[16];

     this.priority = byte.MaxValue;
     this.flagDomain = DomainFlagValue.MustVisit;
     this.flagRobots = false;
 }
Example #15
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToIndex"/> for the supplied Url and identifier.
 /// Visiting the Url is assumed to be allowed (the <see cref="FlagRobots"/> property
 /// is set to false) and the Url is assumed to belong to the domain we wish to visit.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
 /// <param name="url">The Url that this object encapsulates</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Url.
 /// </exception>
 public InternetUrlToIndex(int ID, string url)
 {
     // Assigning the Url may throw, so it happens before anything else is stored.
     // NOTE(review): validation and the MD5 value are presumably handled when Url
     // is assigned - confirm.
     this.Url = url;
     this.ID = ID;

     // Remaining values fall back to their defaults.
     this.priority = byte.MaxValue;
     this.flagDomain = DomainFlagValue.MustVisit;
     this.flagRobots = false;
 }
Example #16
0
 /// <summary>
 /// Copy constructor: builds an <see cref="InternetUrlToCrawl"/> from the values of
 /// an existing <see cref="InternetUrlToCrawl"/> object.
 /// </summary>
 /// <param name="IUrl">The <see cref="InternetUrlToCrawl"/> object to copy from</param>
 /// <remarks>
 /// The ID, Url, CRC, FlagDomain, FlagFetchRobots and RobotsDisallowedPaths values of
 /// the source are copied.
 /// </remarks>
 public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
 {
     this.ID = IUrl.ID;
     this.Url = IUrl.Url;

     // NOTE(review): the MD5 value is not copied explicitly; presumably it is
     // derived when Url is assigned - confirm against the Url setter.
     this.robotsDisallowedPaths = IUrl.RobotsDisallowedPaths;
     this.flagFetchRobots = IUrl.FlagFetchRobots;
     this.flagDomain = IUrl.FlagDomain;
     this.crc = IUrl.CRC;
 }
Example #17
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToCrawl"/> for the supplied Url and identifier.
 /// Every value of the new instance is provided by the caller.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
 /// <param name="url">The Url that this object encapsulates.</param>
 /// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToCrawl"/> object.</param>
 /// <param name="crc">The Cyclic Redundancy Check value associated with the <see cref="InternetUrlToCrawl"/> object.</param>
 /// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the crawler must check the country of origin of the contents of the Url.</param>
 /// <param name="flagFetchRobots">A boolean flag indicating whether a new copy of the robots.txt file related with this Url must be downloaded.</param>
 /// <param name="robotsDisallowedPaths">A string containing the space-delimited paths on a host that the crawler must not visit.</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Uri.
 /// </exception>
 public InternetUrlToCrawl(int ID, string url, byte[] md5, long crc, DomainFlagValue flagDomain, bool flagFetchRobots, string robotsDisallowedPaths)
 {
     // Assigning the Url may throw, so it happens before anything else is stored.
     // NOTE(review): validation presumably happens when Url is assigned - confirm.
     this.Url = url;
     this.ID = ID;

     // The caller-supplied hash is stored after the Url.
     this.MD5 = md5;

     this.robotsDisallowedPaths = robotsDisallowedPaths;
     this.flagFetchRobots = flagFetchRobots;
     this.flagDomain = flagDomain;
     this.crc = crc;
 }
Example #18
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToCrawl"/> for the supplied Url and identifier,
 /// leaving every other value at its default.
 /// </summary>
 /// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
 /// <param name="url">The Url that this object encapsulates.</param>
 /// <exception cref="CWMalformedUrlException">
 /// Thrown if the supplied string value is not a valid Uri.
 /// </exception>
 /// <remarks>
 /// The CRC is zero, the domain flag is <see cref="DomainFlagValue.Unknown"/>, the
 /// fetch-robots flag is false and the disallowed robots paths are an empty string.
 /// </remarks>
 public InternetUrlToCrawl(int ID, string url)
 {
     // Assigning the Url may throw, so it happens before anything else is stored.
     // NOTE(review): validation and the MD5 value are presumably handled when Url
     // is assigned - confirm.
     this.Url = url;
     this.ID = ID;

     // Remaining values fall back to their defaults.
     this.robotsDisallowedPaths = string.Empty;
     this.flagFetchRobots = false;
     this.flagDomain = DomainFlagValue.Unknown;
     this.crc = 0L;
 }
Example #19
0
 /// <summary>
 /// Builds an <see cref="InternetUrlToCrawl"/> whose values are all defaults.
 /// </summary>
 /// <remarks>
 /// The ID and CRC are zero, the domain flag is
 /// <see cref="DomainFlagValue.Unknown"/>, the fetch-robots flag is false and the
 /// disallowed robots paths are an empty string. The Url and MD5 values are not
 /// assigned by this constructor.
 /// </remarks>
 public InternetUrlToCrawl()
 {
     this.ID = 0;

     // Url and MD5 are intentionally left untouched here.
     this.robotsDisallowedPaths = string.Empty;
     this.flagFetchRobots = false;
     this.flagDomain = DomainFlagValue.Unknown;
     this.crc = 0L;
 }
Example #20
0
 /// <summary>
 /// Builds a <see cref="UrlCrawlData"/> instance with every field set to its
 /// default value.
 /// </summary>
 /// <remarks>
 /// The url is a default-constructed <see cref="InternetUrlToCrawl"/>, the time stamp
 /// is the current UTC time and the list of out links is left null.
 /// </remarks>
 public UrlCrawlData()
 {
     // The url being described and the state of the crawl.
     this.url = new InternetUrlToCrawl();
     this.updated = false;
     this.redirected = false;

     // Values that apply to the redirection target.
     this.redirectedFlagRobots = false;
     this.redirectedFlagDomain = DomainFlagValue.MustVisit;
     this.redirectedPriority = 255;

     // Result of the retrieval.
     this.httpStatusCode = HttpStatusCode.OK;
     this.data = string.Empty;
     this.timeStamp = DateTime.UtcNow;
     this.retrievalTime = 0;

     // No list is allocated up front; out links stay null until assigned.
     this.outLinks = null;
 }