Exemplo n.º 1
0
        /// <summary>
        /// Downloads the page at the specified URL, splits its text into sentences,
        /// and reports each sentence in which a year is found.
        /// </summary>
        /// <remarks>
        /// A sentence ends at '.', '?' or '!'. Year detection is delegated to
        /// <c>Text.ExtractYear</c>; a result of -1 is treated as "no year found".
        /// Text still pending when the end of the page is reached is not evaluated.
        /// </remarks>
        /// <param name="report">Object to report to.</param>
        /// <param name="url">The url.</param>
        /// <param name="desiredYear">The desired year, or -1 to accept any year.</param>
        public static void CheckURL(ScanReportable report, Uri url,
                                    int desiredYear)
        {
            WebRequest http = HttpWebRequest.Create(url);

            http.Timeout = 10000;

            // Release the connection and the stream even if parsing or a report
            // callback throws; the original never closed either of them.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML     html        = new ParseHTML(istream);
                StringBuilder sentence    = new StringBuilder();
                String        ignoreUntil = null;
                int           ch;

                do
                {
                    ch = html.Read();
                    if ((ch != -1) && (ch != 0) && (ignoreUntil == null))
                    {
                        if (".?!".IndexOf((char)ch) != -1)
                        {
                            // end of sentence: evaluate what has been collected
                            String str  = sentence.ToString();
                            int    year = Text.ExtractYear(str);

                            if (desiredYear == -1)
                            {
                                // looking for any year
                                if (year != -1)
                                {
                                    report.ReceiveGoodSentence(str);
                                }
                            }
                            else
                            {
                                // looking for a specific year
                                if (year == desiredYear)
                                {
                                    report.ReceiveGoodSentence(str);
                                }
                                else if (year != -1)
                                {
                                    report.ReceiveBadSentence(str);
                                }
                            }
                            sentence.Length = 0;
                        }
                        else if (ch == ' ')
                        {
                            AppendSeparatorSpace(sentence);
                        }
                        else if ((ch != '\n') && (ch != '\t') && (ch != '\r'))
                        {
                            // characters outside the 7-bit range are dropped
                            if (ch < 128)
                            {
                                sentence.Append((char)ch);
                            }
                        }
                    }
                    else if (ch == 0)
                    {
                        // a value of 0 means a tag was read; it is exposed via html.Tag
                        string tagName = html.Tag.Name;

                        // these tags start a new piece of text, so drop what came before
                        if (IsSentenceResetTag(tagName))
                        {
                            sentence.Length = 0;
                        }

                        // ignore everything between script and style tags
                        if (ignoreUntil == null)
                        {
                            if (tagName.Equals("script", StringComparison.OrdinalIgnoreCase))
                            {
                                ignoreUntil = "/script";
                            }
                            else if (tagName.Equals("style", StringComparison.OrdinalIgnoreCase))
                            {
                                ignoreUntil = "/style";
                            }
                        }
                        else if (tagName.Equals(ignoreUntil, StringComparison.OrdinalIgnoreCase))
                        {
                            ignoreUntil = null;
                        }

                        // add a space after the tag
                        AppendSeparatorSpace(sentence);
                    }
                } while (ch != -1);
            }
        }

        /// <summary>
        /// Appends a single space to a non-empty buffer unless it already ends with one.
        /// </summary>
        /// <param name="sentence">The sentence buffer to extend.</param>
        private static void AppendSeparatorSpace(StringBuilder sentence)
        {
            // Index the builder directly instead of copying it to a string first.
            if ((sentence.Length > 0) && (sentence[sentence.Length - 1] != ' '))
            {
                sentence.Append(' ');
            }
        }

        /// <summary>
        /// Determines whether the tag is one of those that discard the text collected so far.
        /// </summary>
        /// <param name="tagName">The tag name as reported by the parser.</param>
        /// <returns>True for body, br, li, p, h1, h2, h3, td and th (any letter case).</returns>
        private static bool IsSentenceResetTag(string tagName)
        {
            // Tag names are identifiers, not linguistic text, so compare ordinally;
            // culture-sensitive comparison fails for "LI" vs "li" under e.g. tr-TR.
            return tagName.Equals("body", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("br", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("li", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("p", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("h1", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("h2", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("h3", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("td", StringComparison.OrdinalIgnoreCase) ||
                   tagName.Equals("th", StringComparison.OrdinalIgnoreCase);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Downloads the page at the specified URL, splits its text into sentences,
        /// and reports each sentence in which a year is found.
        /// </summary>
        /// <remarks>
        /// A sentence ends at '.', '?' or '!'. Year detection is delegated to
        /// <c>Text.ExtractYear</c>; a result of -1 is treated as "no year found".
        /// Text still pending when the end of the page is reached is not evaluated.
        /// </remarks>
        /// <param name="report">Object to report to.</param>
        /// <param name="url">The url.</param>
        /// <param name="desiredYear">The desired year, or -1 to accept any year.</param>
        public static void CheckURL(ScanReportable report, Uri url,
                 int desiredYear)
        {
            WebRequest http = HttpWebRequest.Create(url);
            http.Timeout = 10000;

            // Release the connection and the stream even if parsing or a report
            // callback throws; the original never closed either of them.
            using (HttpWebResponse response = (HttpWebResponse)http.GetResponse())
            using (Stream istream = response.GetResponseStream())
            {
                ParseHTML html = new ParseHTML(istream);
                StringBuilder sentence = new StringBuilder();
                String ignoreUntil = null;
                int ch;

                do
                {
                    ch = html.Read();
                    if ((ch != -1) && (ch != 0) && (ignoreUntil == null))
                    {
                        if (".?!".IndexOf((char)ch) != -1)
                        {
                            // end of sentence: evaluate what has been collected
                            String str = sentence.ToString();
                            int year = Text.ExtractYear(str);

                            if (desiredYear == -1)
                            {
                                // looking for any year
                                if (year != -1)
                                {
                                    report.ReceiveGoodSentence(str);
                                }
                            }
                            else
                            {
                                // looking for a specific year
                                if (year == desiredYear)
                                {
                                    report.ReceiveGoodSentence(str);
                                }
                                else if (year != -1)
                                {
                                    report.ReceiveBadSentence(str);
                                }
                            }
                            sentence.Length = 0;
                        }
                        else if (ch == ' ')
                        {
                            AppendSeparatorSpace(sentence);
                        }
                        else if ((ch != '\n') && (ch != '\t') && (ch != '\r'))
                        {
                            // characters outside the 7-bit range are dropped
                            if (ch < 128)
                            {
                                sentence.Append((char)ch);
                            }
                        }
                    }
                    else if (ch == 0)
                    {
                        // a value of 0 means a tag was read; it is exposed via html.Tag
                        string tagName = html.Tag.Name;

                        // these tags start a new piece of text, so drop what came before
                        if (IsSentenceResetTag(tagName))
                        {
                            sentence.Length = 0;
                        }

                        // ignore everything between script and style tags
                        if (ignoreUntil == null)
                        {
                            if (tagName.Equals("script", StringComparison.OrdinalIgnoreCase))
                            {
                                ignoreUntil = "/script";
                            }
                            else if (tagName.Equals("style", StringComparison.OrdinalIgnoreCase))
                            {
                                ignoreUntil = "/style";
                            }
                        }
                        else if (tagName.Equals(ignoreUntil, StringComparison.OrdinalIgnoreCase))
                        {
                            ignoreUntil = null;
                        }

                        // add a space after the tag
                        AppendSeparatorSpace(sentence);
                    }
                } while (ch != -1);
            }
        }

        /// <summary>
        /// Appends a single space to a non-empty buffer unless it already ends with one.
        /// </summary>
        /// <param name="sentence">The sentence buffer to extend.</param>
        private static void AppendSeparatorSpace(StringBuilder sentence)
        {
            // Index the builder directly instead of copying it to a string first.
            if ((sentence.Length > 0) && (sentence[sentence.Length - 1] != ' '))
            {
                sentence.Append(' ');
            }
        }

        /// <summary>
        /// Determines whether the tag is one of those that discard the text collected so far.
        /// </summary>
        /// <param name="tagName">The tag name as reported by the parser.</param>
        /// <returns>True for body, br, li, p, h1, h2, h3, td and th (any letter case).</returns>
        private static bool IsSentenceResetTag(string tagName)
        {
            // Tag names are identifiers, not linguistic text, so compare ordinally;
            // culture-sensitive comparison fails for "LI" vs "li" under e.g. tr-TR.
            return tagName.Equals("body", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("br", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("li", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("p", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("h1", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("h2", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("h3", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("td", StringComparison.OrdinalIgnoreCase)
                    || tagName.Equals("th", StringComparison.OrdinalIgnoreCase);
        }