/// <summary>
/// Checks if a Url belongs to the part of the web we wish to crawl.
/// </summary>
/// <param name="targetUrl">The url to examine. It is only read by this method.</param>
/// <returns>
/// <see cref="DomainFlagValue.MustVisit"/> when the host name of the Url ends with
/// ".gr", contains "ebay.com", or is recognised as an IP address for which the IP
/// table reports the country code "GR"; otherwise <see cref="DomainFlagValue.Unknown"/>.
/// </returns>
/// <remarks>
/// Since it is possible for a url that belongs to a non-greek domain (e.g. .com) to
/// contain greek content, Urls that are not matched by the rules above get the value
/// of <see cref="DomainFlagValue.Unknown"/>. This allows the system to assign them at
/// a later time to a client who will visit them and check their content-type
/// encoding, in order to determine whether they are of interest to the system.
/// Any exception raised while examining the Url is logged as a warning and the
/// method returns <see cref="DomainFlagValue.Unknown"/>.
/// </remarks>
public DomainFlagValue FilterUrl(ref string targetUrl)
{
    DomainFlagValue retVal = DomainFlagValue.Unknown;
    // Tracks whether this thread owns the mutex, so that the finally block never
    // tries to release a mutex that was not actually acquired.
    bool ownsMutex = false;
    try
    {
        mutex.WaitOne();
        ownsMutex = true;

        string targetHost = InternetUtils.HostName(targetUrl);
        // Host names are identifiers, not linguistic text: compare ordinally.
        if (targetHost.EndsWith(".gr", StringComparison.Ordinal) || targetHost.Contains("ebay.com"))
        {
            retVal = DomainFlagValue.MustVisit;
        }
        else if (IsIPAddress(ref targetHost) && ipTable.GetCountry(targetHost) == "GR")
        {
            retVal = DomainFlagValue.MustVisit;
        }
    }
    catch (System.Threading.AbandonedMutexException e)
    {
        // An abandoned mutex is still handed to the waiting thread, so it must be
        // released below even though the wait ended with an exception.
        ownsMutex = true;
        globals.FileLog.LogWarning(e.ToString());
    }
    catch (Exception e)
    {
        globals.FileLog.LogWarning(e.ToString());
    }
    finally
    {
        if (ownsMutex)
        {
            mutex.ReleaseMutex();
        }
    }
    return retVal;
}
/// <summary>
/// Copy constructor: creates a new <see cref="InternetUrlToIndex"/> whose values are
/// taken from an existing <see cref="InternetUrlToIndex"/> instance.
/// </summary>
/// <param name="IUrl">The existing <see cref="InternetUrlToIndex"/> to copy from</param>
public InternetUrlToIndex(InternetUrlToIndex IUrl)
{
    InternetUrlToIndex source = IUrl;

    // Identifier and address. The MD5 hash is not assigned explicitly here.
    this.ID = source.ID;
    this.Url = source.Url;

    // Crawl flags and priority.
    this.flagRobots = source.FlagRobots;
    this.flagDomain = source.FlagDomain;
    this.priority = source.Priority;
}
/// <summary>
/// Copy constructor: creates a new <see cref="InternetUrlToCrawl"/> whose values are
/// taken from an existing <see cref="InternetUrlToCrawl"/> instance.
/// </summary>
/// <param name="IUrl">The existing <see cref="InternetUrlToCrawl"/> to copy from</param>
public InternetUrlToCrawl(InternetUrlToCrawl IUrl)
{
    InternetUrlToCrawl source = IUrl;

    // Identifier and address. The MD5 hash is not assigned explicitly here.
    this.ID = source.ID;
    this.Url = source.Url;

    // Checksum, domain flag and robots-related state.
    this.crc = source.CRC;
    this.flagDomain = source.FlagDomain;
    this.flagFetchRobots = source.FlagFetchRobots;
    this.robotsDisallowedPaths = source.RobotsDisallowedPaths;
}
/// <summary>
/// Creates an <see cref="InternetUrlToCrawl"/> initialized with default values: a zero
/// identifier and checksum, an unknown domain flag, no robots.txt fetch requested and
/// an empty list of disallowed paths.
/// </summary>
public InternetUrlToCrawl()
{
    // The Url and its MD5 hash are not assigned by this constructor.
    this.ID = 0;
    this.crc = 0L;
    this.flagDomain = DomainFlagValue.Unknown;
    this.flagFetchRobots = false;
    this.robotsDisallowedPaths = string.Empty;
}
/// <summary>
/// Creates an <see cref="InternetUrlToIndex"/> initialized with default values: a zero
/// identifier, an empty Url, an all-zero 16-byte MD5 hash, a cleared robots flag, a
/// domain flag of <see cref="DomainFlagValue.MustVisit"/> and a priority of 255.
/// </summary>
public InternetUrlToIndex()
{
    this.ID = 0;
    this.Url = string.Empty;
    // A newly allocated byte array is zero-filled, which yields the all-zero hash.
    this.MD5 = new byte[16];
    this.flagRobots = false;
    this.flagDomain = DomainFlagValue.MustVisit;
    this.priority = byte.MaxValue;
}
/// <summary>
/// Creates a <see cref="UrlCrawlData"/> instance populated with default values.
/// </summary>
public UrlCrawlData()
{
    // The url being described, plus its update / redirection state.
    this.url = new InternetUrlToCrawl();
    this.updated = false;
    this.redirected = false;

    // Defaults used for the redirection target.
    this.redirectedFlagRobots = false;
    this.redirectedFlagDomain = DomainFlagValue.MustVisit;
    this.redirectedPriority = byte.MaxValue;

    // Result of the fetch: status, content and timing (timestamp is in UTC).
    this.httpStatusCode = HttpStatusCode.OK;
    this.data = string.Empty;
    this.timeStamp = DateTime.UtcNow;
    this.retrievalTime = 0;

    // No out-links collection is allocated up front.
    this.outLinks = null;
}
/// <summary>
/// Creates an <see cref="InternetUrlToIndex"/> that points to the given Url and is
/// associated with the given identifier, with every value supplied by the caller.
/// </summary>
/// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
/// <param name="url">The Url that this object encapsulates</param>
/// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToIndex"/> object</param>
/// <param name="flagRobots">A boolean flag indicating whether it's allowed to visit the Url</param>
/// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the Url belongs to the domain we wish to visit</param>
/// <param name="priority">A <see cref="System.Byte"/> flag indicating the url's crawling priority</param>
/// <exception cref="CWMalformedUrlException">
/// Thrown if the supplied string value is not a valid Url.
/// </exception>
public InternetUrlToIndex(int ID, string url, byte[] md5, bool flagRobots, DomainFlagValue flagDomain, byte priority)
{
    // The Url is assigned first because this assignment may throw an exception.
    this.Url = url;
    this.ID = ID;

    // Remaining values are stored exactly as supplied.
    this.MD5 = md5;
    this.flagRobots = flagRobots;
    this.flagDomain = flagDomain;
    this.priority = priority;
}
/// <summary>
/// Creates an <see cref="InternetUrlToIndex"/> that points to the given Url and is
/// associated with the given identifier. The robots flag is cleared (set to false),
/// the domain flag is set to <see cref="DomainFlagValue.MustVisit"/> and the priority
/// is set to 255.
/// </summary>
/// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToIndex"/></param>
/// <param name="url">The Url that this object encapsulates</param>
/// <exception cref="CWMalformedUrlException">
/// Thrown if the supplied string value is not a valid Url.
/// </exception>
public InternetUrlToIndex(int ID, string url)
{
    // The Url is assigned first because this assignment may throw an exception.
    // The MD5 hash is not assigned explicitly by this constructor.
    this.Url = url;
    this.ID = ID;

    // Defaults for the values the caller did not supply.
    this.flagRobots = false;
    this.flagDomain = DomainFlagValue.MustVisit;
    this.priority = byte.MaxValue;
}
/// <summary>
/// Creates an <see cref="InternetUrlToCrawl"/> that points to the given Url and is
/// associated with the given identifier, with every value supplied by the caller.
/// </summary>
/// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
/// <param name="url">The Url that this object encapsulates.</param>
/// <param name="md5">The MD5 Hash Code for this <see cref="InternetUrlToCrawl"/> object.</param>
/// <param name="crc">The Cyclic Redundancy Check value associated with this <see cref="InternetUrlToCrawl"/> object.</param>
/// <param name="flagDomain">A <see cref="DomainFlagValue"/> flag indicating whether the crawler must check the country of origin of the contents of the Url.</param>
/// <param name="flagFetchRobots">A boolean flag indicating whether a new copy of the robots.txt file related with this Url must be downloaded.</param>
/// <param name="robotsDisallowedPaths">A string containing the space-delimited paths on a host that the crawler must not visit.</param>
/// <exception cref="CWMalformedUrlException">
/// Thrown if the supplied string value is not a valid Uri.
/// </exception>
public InternetUrlToCrawl(int ID, string url, byte[] md5, long crc, DomainFlagValue flagDomain, bool flagFetchRobots, string robotsDisallowedPaths)
{
    // The Url is assigned first because this assignment may throw an exception.
    this.Url = url;
    this.ID = ID;

    // Remaining values are stored exactly as supplied.
    this.MD5 = md5;
    this.crc = crc;
    this.flagDomain = flagDomain;
    this.flagFetchRobots = flagFetchRobots;
    this.robotsDisallowedPaths = robotsDisallowedPaths;
}
/// <summary>
/// Creates an <see cref="InternetUrlToCrawl"/> that points to the given Url and is
/// associated with the given identifier. The checksum is set to zero, the domain flag
/// to <see cref="DomainFlagValue.Unknown"/>, no robots.txt fetch is requested and the
/// disallowed paths are empty.
/// </summary>
/// <param name="ID">The unique identifier associated with this <see cref="InternetUrlToCrawl"/>.</param>
/// <param name="url">The Url that this object encapsulates.</param>
/// <exception cref="CWMalformedUrlException">
/// Thrown if the supplied string value is not a valid Uri.
/// </exception>
public InternetUrlToCrawl(int ID, string url)
{
    // The Url is assigned first because this assignment may throw an exception.
    // The MD5 hash is not assigned explicitly by this constructor.
    this.Url = url;
    this.ID = ID;

    // Defaults for the values the caller did not supply.
    this.crc = 0L;
    this.flagDomain = DomainFlagValue.Unknown;
    this.flagFetchRobots = false;
    this.robotsDisallowedPaths = string.Empty;
}
/// <summary>
/// Attempts to extract the appropriate FlagDomain value from the contents of the document.
/// </summary>
/// <param name="content">The HTML content that must be parsed for Domain Flag value. It is only read by this method.</param>
/// <returns>
/// <see cref="DomainFlagValue.MustVisit"/> if the content contains at least one of the
/// greek characters, <see cref="DomainFlagValue.MustNotVisit"/> if it contains none,
/// or <see cref="DomainFlagValue.Unknown"/> if there is no content (or no character
/// set) to examine.
/// </returns>
private DomainFlagValue ExtractDomainFlag(ref string content)
{
    // Without text to scan, or characters to scan for, no verdict can be reached.
    // These are checked explicitly instead of relying on a swallowed exception.
    if (content == null || greekChars == null)
    {
        return DomainFlagValue.Unknown;
    }

    // A single greek character is enough to assume the content is in greek.
    bool hasGreekCharacter = content.IndexOfAny(greekChars) != -1;
    return hasGreekCharacter ? DomainFlagValue.MustVisit : DomainFlagValue.MustNotVisit;
}
/// <summary>
/// Creates a <see cref="UrlCrawlData"/> instance populated with default values.
/// </summary>
public UrlCrawlData()
{
    // The url being described, plus its update / redirection state.
    this.url = new InternetUrlToCrawl();
    this.updated = false;
    this.redirected = false;

    // Defaults used for the redirection target.
    this.redirectedFlagRobots = false;
    this.redirectedFlagDomain = DomainFlagValue.MustVisit;
    this.redirectedPriority = byte.MaxValue;

    // Result of the fetch: status, content and timing (timestamp is in UTC).
    this.httpStatusCode = HttpStatusCode.OK;
    this.data = string.Empty;
    this.timeStamp = DateTime.UtcNow;
    this.retrievalTime = 0;

    // No out-links collection is allocated up front.
    this.outLinks = null;
}