Beispiel #1
0
        /// <summary>
        /// Resets the discovery state, parses the HTML supplied in <paramref name="readStream"/>
        /// with handlers registered for the "link", "a" and "/a" tags, and stores the trimmed text
        /// fragment read while a candidate URL is pending as the hint text of that candidate.
        /// When result downloading is enabled, processing of the first candidate is started afterwards.
        /// </summary>
        /// <param name="url">Address of the page; becomes the base URI of this discovery run.</param>
        /// <param name="readStream">Stream holding the page contents.</param>
        /// <param name="charset">Charset handed to <c>OpenHTMLReader</c> together with the stream.</param>
        public void StartDiscover(string url, Stream readStream, string charset)
        {
            OnDiscoverProgress("Discovering...");

            // Start every run from a clean slate.
            _baseUri            = new Uri(url);
            _results            = new RSSDiscoverResults();
            _candidateURLs      = new PriorityQueue();
            _candidateHintTexts = new HashMap();
            _candidateURLSet    = new HashSet();

            using (HTMLParser htmlParser = new HTMLParser(OpenHTMLReader(readStream, charset)))
            {
                htmlParser.AddTagHandler("link", new HTMLParser.TagHandler(OnLinkTag));
                htmlParser.AddTagHandler("a", new HTMLParser.TagHandler(OnATag));
                htmlParser.AddTagHandler("/a", new HTMLParser.TagHandler(OnEndATag));

                while (!htmlParser.Finished)
                {
                    string text = htmlParser.ReadNextFragment();

                    // NOTE(review): _lastCandidateURL is presumably set by the tag handlers above - confirm.
                    if (_lastCandidateURL == null)
                    {
                        continue;
                    }

                    _candidateHintTexts [_lastCandidateURL] = text.Trim();
                    _lastCandidateURL = null;
                }

                _lastPriority = -1;
                if (_downloadResults)
                {
                    ParseNextCandidate();
                }
            }
        }
Beispiel #2
0
 /// <summary>
 /// Supplies the text of a note resource to the consumer: the HTML held in the long body is
 /// split into fragments that are reported either as headings or as ordinary fragments, then
 /// offset counting is restarted and the subject is reported as a heading.
 /// Resources that are <c>null</c> or not of the note type are ignored.
 /// </summary>
 /// <param name="res">Resource whose text is requested; may be <c>null</c>.</param>
 /// <param name="consumer">Receiver of the extracted text.</param>
 /// <returns>Always <c>true</c>.</returns>
 bool IResourceTextProvider.ProcessResourceText(IResource res, IResourceTextConsumer consumer)
 {
     if (res == null)
     {
         return true;
     }

     int id = res.Id;
     if (res.Type != _Note)
     {
         return true;
     }

     StringReader bodyReader = new StringReader(res.GetPropText(Core.Props.LongBody));
     using (HTMLParser parser = new HTMLParser(bodyReader, true))
     {
         while (!parser.Finished)
         {
             string text = parser.ReadNextFragment();
             if (text.Length == 0)
             {
                 continue;
             }

             if (parser.InHeading)
             {
                 consumer.AddDocumentHeading(res.Id, text);
             }
             else
             {
                 consumer.AddDocumentFragment(res.Id, text);
             }
         }
     }

     // The subject is reported after the body, with offsets counted anew.
     consumer.RestartOffsetCounting();
     consumer.AddDocumentHeading(id, res.GetPropText(Core.Props.Subject));
     return true;
 }
Beispiel #3
0
        /// <summary>
        /// With word breaking switched off, expects the BODY text as one fragment, then an empty
        /// fragment, after which the parser must report that it has finished. The text placed
        /// after the closing BODY tag must not be returned.
        /// </summary>
        public void SimpleBodyNoWordBreak()
        {
            string markup = "<HTML><HEAD><BODY>text in body</BODY>text to be ignored</HEAD></HTML>";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                parser.BreakWords = false;

                Assert.AreEqual(false, parser.Finished);
                string bodyText = parser.ReadNextFragment();
                Assert.AreEqual("text in body", bodyText);

                Assert.AreEqual(false, parser.Finished);
                string tail = parser.ReadNextFragment();
                Assert.AreEqual("", tail);

                Assert.AreEqual(true, parser.Finished);
            }
        }
Beispiel #4
0
        /// <summary>
        /// Paste handler: unless the dirty flag is set, collects the heading text at the start
        /// of the editor's HTML and, if any was found, puts it into the subject box.
        /// Collection stops at the first non-empty fragment outside a heading that follows
        /// already collected heading text.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        protected void OnPaste(object sender, EventArgs e)
        {
            if (_dirty)
            {
                return;
            }

            string       heading    = "";
            StringReader htmlReader = new StringReader(_htmled.Html);
            using (HTMLParser parser = new HTMLParser(htmlReader, true))
            {
                while (!parser.Finished)
                {
                    string text = parser.ReadNextFragment();
                    if (text.Length == 0)
                    {
                        continue;
                    }

                    if (parser.InHeading)
                    {
                        heading += text;
                    }
                    else if (heading.Length > 0)
                    {
                        // Ordinary text after the heading: the heading is complete.
                        break;
                    }
                }
            }

            if (heading.Length > 0)
            {
                _subject.Text = heading;
            }
        }
Beispiel #5
0
        /// <summary>
        /// Feeds the parser a document whose META tag declares the charset in mixed case and
        /// expects the body fragment, an empty fragment, the finished state, and the charset
        /// reported as "windows-1251".
        /// </summary>
        public void CharsetNoWordBreak()
        {
            string markup = "<HTML><meTa httP-eQuIv=\"Content-Type\" content=\"text/html; cHaRseT=WinDowS-1251\"><BODY>1st frag</BODY></HTML>";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                parser.BreakWords = false;

                Assert.AreEqual(false, parser.Finished);
                string bodyText = parser.ReadNextFragment();
                Assert.AreEqual("1st frag", bodyText);

                Assert.AreEqual(false, parser.Finished);
                string tail = parser.ReadNextFragment();
                Assert.AreEqual("", tail);

                Assert.AreEqual(true, parser.Finished);
                Assert.AreEqual("windows-1251", parser.CharSet, "Invalid charset!");
            }
        }
Beispiel #6
0
        /// <summary>
        /// Expects a tag carrying quoted attribute values to split the body text into two
        /// fragments (whitespace preserved), followed by an empty fragment and the finished state.
        /// </summary>
        public void QuotesInTagNoWordBreak()
        {
            string markup = "<HTML><HEAD><BODY>1st frag<P a=\"aaaa\" b=\"bbbb\"> 2nd frag </BODY></HEAD></HTML>";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                parser.BreakWords = false;

                Assert.AreEqual(false, parser.Finished);
                string first = parser.ReadNextFragment();
                Assert.AreEqual("1st frag", first);

                Assert.AreEqual(false, parser.Finished);
                string second = parser.ReadNextFragment();
                Assert.AreEqual(" 2nd frag ", second);

                Assert.AreEqual(false, parser.Finished);
                string tail = parser.ReadNextFragment();
                Assert.AreEqual("", tail);

                Assert.AreEqual(true, parser.Finished);
            }
        }
Beispiel #7
0
        /// <summary>
        /// Expects numeric (decimal and hexadecimal, either case of the "x") and named character
        /// entity references to be replaced by the characters they denote in the returned fragments.
        /// </summary>
        public void CharEntityReferencesNoWordBreak()
        {
            string markup = "<body><p>&#x69;&#X6E;&#x63;lude &lt;&#X6C;ist&gt;<p>inclu&#100;&#101; &quot;omniamea.h&quot;<p>#include &laquo;Kama&mdash;Sutra&raquo;</p></body>";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                parser.BreakWords = false;

                Assert.AreEqual(false, parser.Finished);
                string first = parser.ReadNextFragment();
                Assert.AreEqual("include <list>", first);

                Assert.AreEqual(false, parser.Finished);
                string second = parser.ReadNextFragment();
                Assert.AreEqual("include \"omniamea.h\"", second);

                Assert.AreEqual(false, parser.Finished);
                string third = parser.ReadNextFragment();
                Assert.AreEqual("#include «Kama—Sutra»", third);

                Assert.AreEqual(false, parser.Finished);
                string tail = parser.ReadNextFragment();
                Assert.AreEqual("", tail);

                Assert.AreEqual(true, parser.Finished);
            }
        }
Beispiel #8
0
        /// <summary>
        /// Expects the contents of SCRIPT elements, in the head as well as in the body, to be
        /// left out: only the title and the two body fragments are returned before the final
        /// empty fragment.
        /// </summary>
        public void ScriptsNoWordBreak()
        {
            string markup = "<HTML><HEAD><Title>The title</tITLe><script>i = 0</script></HEAD><BODY>1st frag<P a=\"aaaa\" b=\"bbbb\"><script>i = 0</script> 2nd frag </BODY></HTML>";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                parser.BreakWords = false;

                Assert.AreEqual(false, parser.Finished);
                string title = parser.ReadNextFragment();
                Assert.AreEqual("The title", title);

                Assert.AreEqual(false, parser.Finished);
                string first = parser.ReadNextFragment();
                Assert.AreEqual("1st frag", first);

                Assert.AreEqual(false, parser.Finished);
                string second = parser.ReadNextFragment();
                Assert.AreEqual(" 2nd frag ", second);

                Assert.AreEqual(false, parser.Finished);
                string tail = parser.ReadNextFragment();
                Assert.AreEqual("", tail);

                Assert.AreEqual(true, parser.Finished);
            }
        }
Beispiel #9
0
        /// <summary>
        /// Expects the first fragment of a document that has a HEAD but no BODY to be empty,
        /// i.e. the text inside HEAD must not be returned.
        /// </summary>
        public void NoBody()
        {
            string markup = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\"><HTML><HEAD> text to be ignored </HEAD></HTML>";

            using (HTMLParser parser = CreateParser(markup))
            {
                string firstFragment = parser.ReadNextFragment();
                if (firstFragment.Length != 0)
                {
                    throw new Exception("Text outside HTML body is read!");
                }
            }
        }
Beispiel #10
0
        /// <summary>
        /// Reads fragments from the parser until it reports that it has finished and concatenates
        /// them, in order and without any separator, into one string.
        /// Afterwards makes one more read, which is required to fail with an
        /// <see cref="EndOfStreamException"/>.
        /// </summary>
        /// <param name="parser">Parser to drain.</param>
        /// <returns>Concatenation of all the fragments read before the parser finished.</returns>
        /// <exception cref="InvalidOperationException">
        /// The read past the end returned a fragment instead of throwing an <see cref="EndOfStreamException"/>.
        /// </exception>
        private string ReadAllFragments(HTMLParser parser)
        {
            StringBuilder collected = new StringBuilder();
            while (!parser.Finished)
            {
                collected.Append(parser.ReadNextFragment());
            }

            string beyondEnd;
            try
            {
                beyondEnd = parser.ReadNextFragment();
            }
            catch (EndOfStreamException)
            {
                // The expected outcome of reading past the end.
                return collected.ToString();
            }

            // The read did not throw: report which of the two expectations was violated.
            if (beyondEnd.Length != 0)
            {
                throw new InvalidOperationException("Parser must return an empty fragment having read the whole text (if there's a tag after the last returned meaningful string).");
            }
            throw new InvalidOperationException("Parser must throw an exception if reading beyond end of stream.");
        }
Beispiel #11
0
        /// <summary>
        /// Reads a well-formed document (with a trailing newline) fragment by fragment until the
        /// parser reports that it has finished; there are no assertions, so the test only checks
        /// that reading completes without an exception.
        /// </summary>
        public void Finishing()
        {
            string markup = "<HTML><HEAD><Title>The title</tITLe></HEAD><BODY>1st frag<P> 2nd frag </BODY></HTML>\n";
            byte[] markupBytes = Encoding.Default.GetBytes(markup);
            StreamReader markupReader = new StreamReader(new MemoryStream(markupBytes));

            using (HTMLParser parser = new HTMLParser(markupReader))
            {
                while (!parser.Finished)
                {
                    // The fragments themselves are of no interest here.
                    parser.ReadNextFragment();
                }
            }
        }
Beispiel #12
0
        /// <summary>
        /// Checks that the parser reaches the finished state on markup that ends with malformed
        /// and surplus closing tags, within a bounded number of reads.
        /// </summary>
        public void FinishingOnOverclosed()
        {
            // Upper bound on the number of reads; a parser that is still not finished after
            // that many reads is considered stuck.
            const int MaxReads = 0x1000;

            string HTML = "<HTML><HEAD><Title>The title</</</</</</</a></a></html></head></title>";

            using (HTMLParser parser = new HTMLParser(
                       new StreamReader(new MemoryStream(Encoding.Default.GetBytes(HTML)))))
            {
                for (int a = 0; (a < MaxReads) && (!parser.Finished); a++)
                {
                    parser.ReadNextFragment();
                }

                // Judge by the parser state rather than by the counter, so that the verdict
                // cannot disagree with the loop bound.
                if (!parser.Finished)
                {
                    Assert.Fail("The parser has failed to finish.");
                }
            }
        }
Beispiel #13
0
        /// <summary>
        /// Checks that, with word breaking switched off, the parser reaches the finished state
        /// on markup whose tags are never closed, within a bounded number of reads.
        /// </summary>
        public void FinishingOnUnclosedNoWordBreak()
        {
            // Upper bound on the number of reads; a parser that is still not finished after
            // that many reads is considered stuck.
            const int MaxReads = 0x1000;

            string HTML = "<HTML><HEAD><Title>The title";

            using (HTMLParser parser = new HTMLParser(
                       new StreamReader(new MemoryStream(Encoding.Default.GetBytes(HTML)))))
            {
                parser.BreakWords = false;
                for (int a = 0; (a < MaxReads) && (!parser.Finished); a++)
                {
                    parser.ReadNextFragment();
                }

                // Judge by the parser state rather than by the counter, so that the verdict
                // cannot disagree with the loop bound.
                if (!parser.Finished)
                {
                    Assert.Fail("The parser has failed to finish.");
                }
            }
        }
Beispiel #14
0
 /// <summary>
 /// Passes the text of an HTML stream to the consumer, fragment by fragment, and then, for a
 /// source of type "Weblink" whose name is empty or is a prefix of its (scheme-less, sanitized)
 /// URL, renames the source to the title of the HTML document.
 /// </summary>
 /// <param name="resource">Resource being processed; its ID accompanies every fragment.</param>
 /// <param name="source">Resource considered for renaming; may be <c>null</c>.</param>
 /// <param name="reader">Reader with the HTML text; the parser's <c>CloseReader</c> flag is cleared for it.</param>
 /// <param name="consumer">Receiver of the headings and fragments.</param>
 private void ProcessResourceStream(IResource resource, IResource source, TextReader reader,
                                    IResourceTextConsumer consumer)
 {
     _currentIndexedRes = resource;
     try
     {
         using (HTMLParser parser = new HTMLParser(reader))
         {
             parser.CloseReader = false;
             parser.AddTagHandler("link", LinkHandler);
             int    docID = resource.Id;
             string fragment;
             while (!parser.Finished)
             {
                 fragment = parser.ReadNextFragment();
                 if (fragment.Length > 0)
                 {
                     if (parser.InHeading)
                     {
                         consumer.AddDocumentHeading(docID, fragment);
                     }
                     else
                     {
                         consumer.AddDocumentFragment(docID, fragment);
                     }
                 }
             }
             // Check whether the source resource is a weblink with a non-empty name property;
             // if the name is empty, or is the beginning of the URL, take the name from the title of the HTML stream.
             if (source != null && source.Type == "Weblink")
             {
                 IBookmarkService service = (IBookmarkService)Core.PluginLoader.GetPluginService(typeof(IBookmarkService));
                 if (service != null)
                 {
                     string name = source.GetPropText(Core.Props.Name);
                     string url  = string.Empty;
                     if (Core.ResourceStore.PropTypes.Exist("URL"))
                     {
                         url = source.GetPropText("URL");
                         // Drop the scheme. Each prefix is cut by its own length, and the comparison is
                         // ordinal because URLs are not linguistic text.
                         foreach (string scheme in new string[] { "http://", "file://", "ftp://" })
                         {
                             if (url.StartsWith(scheme, System.StringComparison.Ordinal))
                             {
                                 url = url.Substring(scheme.Length);
                                 break;
                             }
                         }
                     }
                     // Replace the characters that are invalid in paths; a URL without such
                     // characters comes out of this loop unchanged.
                     foreach (char invalidChar in Path.GetInvalidPathChars())
                     {
                         url = url.Replace(invalidChar, '-');
                     }
                     if (name.Length == 0 || url.StartsWith(name, System.StringComparison.Ordinal))
                     {
                         string title = parser.Title.Trim();
                         if (title.Length > 0)
                         {
                             IBookmarkProfile profile = service.GetOwnerProfile(source);
                             string           error;  // Required by CanRename; the text is not used here
                             if (profile != null && profile.CanRename(source, out error))
                             {
                                 profile.Rename(source, title);
                                 service.SetName(source, title);
                             }
                         }
                     }
                 }
             }
         }
     }
     finally
     {
         // The resource is "current" only while its stream is being processed.
         _currentIndexedRes = null;
     }
 }
Beispiel #15
0
        /// <summary>
        /// Indexes an HTML text for the given resource so that the offsets recorded by the consumer
        /// match the offsets of the words in the source HTML representation.
        /// </summary>
        /// <param name="resourceId">ID of the resource that is being indexed.</param>
        /// <param name="html">HTML text to index; must not be <c>null</c>.</param>
        /// <param name="consumer">Consumer that receives the words.</param>
        /// <param name="section">Document section the content belongs to, see <see cref="DocumentSection"/>. <c>null</c> stands for <see cref="DocumentSection.BodySection"/>.</param>
        /// <exception cref="ArgumentNullException"><paramref name="html"/> is <c>null</c>.</exception>
        /// <remarks>
        /// <para>The text is read word by word. Before a word is passed on, the consumer's offset is advanced by the number of HTML characters that have no counterpart in the extracted text (skipped markup and substituted entities); this is done only when the consumer's purpose is indexing or context extraction.</para>
        /// <para>Offset counting is not restarted here. If other sections were indexed before, call <see cref="IResourceTextConsumer.RestartOffsetCounting"/> first.</para>
        /// </remarks>
        public static void IndexHtml(int resourceId, string html, IResourceTextConsumer consumer, string section)
        {
            if (html == null)
            {
                throw new ArgumentNullException("html", "HTML body must not be null.");
            }

            if (section == null)
            {
                section = DocumentSection.BodySection;
            }

            // The parser treats text as body content only inside a body tag, so a bare fragment gets
            // wrapped. The length of the added prefix must be excluded from the offsets later on.
            int  wrapperPrefixLength = 0;
            bool hasHtmlTag          = Utils.IndexOf(html, "<html>", true) >= 0; // Case-insensitive check
            if (!hasHtmlTag || Utils.IndexOf(html, "<body", true) < 0)
            {
                html                = "<html><body>" + html + "</body></html>";
                wrapperPrefixLength = "<html><body>".Length;
            }

            using (HTMLParser parser = new HTMLParser(new StringReader(html)))
            {
                // Reading word by word keeps the offset of every word valid; within a longer block
                // the first entity reference would shift everything that follows it.
                parser.BreakWords = true;

                Debug.Assert(consumer != null);

                int previousWordEnd  = wrapperPrefixLength; // HTML position right after the previous word; starts after the added prefix
                int previousWordSlack = 0;                  // HTML length of the previous word minus its text length
                while (!parser.Finished)
                {
                    int    wordStart; // HTML position right before the current word
                    string word = parser.ReadNextFragment(out wordStart);
                    if (word.Length == 0)
                    {
                        continue; // Empty fragments take no part in the offset bookkeeping
                    }

                    // Markup skipped since the previous word plus whatever the previous word lost
                    // through entity substitution.
                    int  offsetShift = wordStart - previousWordEnd + previousWordSlack;
                    bool tracksOffsets =
                        (consumer != null) &&
                        ((consumer.Purpose == TextRequestPurpose.Indexing) ||
                         (consumer.Purpose == TextRequestPurpose.ContextExtraction));
                    if (tracksOffsets && (offsetShift != 0)) // No dummy calls
                    {
                        consumer.IncrementOffset(offsetShift);
                    }

                    consumer.AddDocumentFragment(resourceId, word, section);

                    previousWordEnd   = parser.Position;
                    previousWordSlack = previousWordEnd - wordStart - word.Length;
                }
            }
        }