コード例 #1
0
        //TODO can we also extract title for this format?

        /// <summary>
        /// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: assigns the name,
        /// an optional date taken from the DATE/DATE_END section, and a tag-stripped body.
        /// This parser does not assign a title.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being populated.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source used to parse the extracted date string.</param>
        /// <param name="docBuf">Raw document text to parse.</param>
        /// <param name="pathType">Not read by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance that was passed in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0;            // offset handed to StripTags when building the body
            DateTime? parsedDate = null;  // stays null when no date section is found

            int textPos = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
            if (textPos >= 0)
            {
                // The body starts TEXT_LENGTH characters past the position where TEXT was found.
                bodyStart = textPos + TEXT_LENGTH;

                // The TEXT_END position is forwarded to Extract as its position argument
                // (presumably an upper bound for the search - confirm against Extract).
                int textEndPos = docBuf.IndexOf(TEXT_END, textPos, StringComparison.Ordinal);
                string rawDate = Extract(docBuf, DATE, DATE_END, textEndPos, DATE_NOISE_PREFIXES);
                if (rawDate != null)
                {
                    // Remove markup and surrounding whitespace before handing the text to the date parser.
                    string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                    parsedDate = trecSrc.ParseDate(cleanedDate);
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
コード例 #2
0
        /// <summary>
        /// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: assigns the name,
        /// an optional date (DATE1/DATE1_END), an optional title (TI/TI_END) and a tag-stripped body.
        /// Date and title are only looked up when the HEADER marker is present.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being populated.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source used to parse the extracted date string.</param>
        /// <param name="docBuf">Raw document text to parse.</param>
        /// <param name="pathType">Not read by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance that was passed in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int mark = 0; // offset handed to StripTags when building the body
            DateTime? date = null;
            string title = null;

            // Tag markers are not linguistic text: compare ordinally, like the sibling parsers do.
            int h1 = docBuf.IndexOf(HEADER, StringComparison.Ordinal);
            if (h1 >= 0)
            {
                int h2 = docBuf.IndexOf(HEADER_END, h1, StringComparison.Ordinal);
                if (h2 >= 0)
                {
                    // Skip everything up to and including the header terminator.
                    mark = h2 + HEADER_END_LENGTH;
                }
                // else: the header is unterminated. Leave mark at 0 rather than skipping an
                // arbitrary (HEADER_END_LENGTH - 1) characters derived from the -1 "not found" result.

                // date... (h2 is forwarded to Extract unchanged, exactly as before)
                string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
                if (dateStr != null)
                {
                    date = trecSrc.ParseDate(dateStr);
                }

                // title...
                title = Extract(docBuf, TI, TI_END, h2, null);
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(date);
            docData.Title = title;
            docData.Body = StripTags(docBuf, mark).ToString();
            return docData;
        }
コード例 #3
0
        /// <summary>
        /// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: assigns the name,
        /// an optional date (DATE/DATE_END), an optional title (SUBJECT first, then HEADLINE)
        /// and a tag-stripped body covering the whole buffer.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being populated.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source used to parse the extracted date string.</param>
        /// <param name="docBuf">Raw document text to parse.</param>
        /// <param name="pathType">Not read by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance that was passed in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date...
            DateTime? parsedDate = null;
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            if (rawDate != null)
            {
                int noisePos = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
                if (noisePos > 0)
                {
                    // Cut after the first three characters of the noise marker: we need the "day" part.
                    rawDate = rawDate.Substring(0, noisePos + 3);
                }

                string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                parsedDate = trecSrc.ParseDate(cleanedDate);
            }

            // title... first try with SUBJECT, then with HEADLINE
            string headline = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
                              ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
            if (headline != null)
            {
                headline = StripTags(headline, 0).ToString().Trim();
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            // Nothing is skipped for this format: the body is built from offset 0.
            docData.Body = StripTags(docBuf, 0).ToString();
            return docData;
        }
コード例 #4
0
        /// <summary>
        /// Skips the DOCHDR section of <paramref name="docBuf"/> (when present and terminated),
        /// optionally parses a date from it, and delegates the remaining text to the
        /// content source's HTML parser.
        /// </summary>
        /// <param name="docData">Reusable result, forwarded to the HTML parser.</param>
        /// <param name="name">Name forwarded to the HTML parser.</param>
        /// <param name="trecSrc">Content source providing <c>ParseDate</c> and <c>HtmlParser</c>.</param>
        /// <param name="docBuf">Raw document text to parse.</param>
        /// <param name="pathType">Not read by this implementation.</param>
        /// <returns>Whatever <c>trecSrc.HtmlParser.Parse</c> returns for the remaining text.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // skip some of the non-html text, optionally set date
            DateTime? date = null;
            int start = 0;

            int h1 = docBuf.IndexOf(DOCHDR, StringComparison.Ordinal);
            if (h1 >= 0)
            {
                int h2 = docBuf.IndexOf(TERMINATING_DOCHDR, h1, StringComparison.Ordinal);

                // h2 is forwarded to Extract unchanged, exactly as before.
                string dateStr = Extract(docBuf, DATE, DATE_END, h2, null);
                if (dateStr != null)
                {
                    date = trecSrc.ParseDate(dateStr);
                }

                if (h2 >= 0)
                {
                    // The HTML starts right after the header terminator.
                    start = h2 + TERMINATING_DOCHDR.Length;
                }
                // else: the header is unterminated. Keep start at 0 instead of cutting an
                // arbitrary (TERMINATING_DOCHDR.Length - 1) characters off the front of the text.
            }

            string html = docBuf.ToString(start, docBuf.Length - start);
            return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc);
        }
コード例 #5
0
ファイル: TrecFTParser.cs プロジェクト: zhuthree/lucenenet
        /// <summary>
        /// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: assigns the name,
        /// an optional date (DATE/DATE_END), an optional title (HEADLINE/HEADLINE_END)
        /// and a tag-stripped body covering the whole buffer.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being populated.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source used to parse the extracted date string.</param>
        /// <param name="docBuf">Raw document text to parse.</param>
        /// <param name="pathType">Not read by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance that was passed in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date... (left null when the document carries no date section)
            DateTime? parsedDate = null;
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            if (rawDate != null)
            {
                parsedDate = trecSrc.ParseDate(rawDate);
            }

            // title... (used as extracted, without tag stripping)
            string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            // Nothing is skipped for this format: the body is built from offset 0.
            docData.Body = StripTags(docBuf, 0).ToString();
            return docData;
        }
コード例 #6
0
ファイル: TrecDocParser.cs プロジェクト: zalintyre/lucenenet
 /// <summary>
 /// Parse the text prepared in <paramref name="docBuf"/> into a result <see cref="DocData"/>.
 /// No synchronization is required.
 /// </summary>
 /// <param name="docData">Reusable result.</param>
 /// <param name="name">Name that should be set to the result.</param>
 /// <param name="trecSrc">Calling trec content source.</param>
 /// <param name="docBuf">Text to parse.</param>
 /// <param name="pathType">Type of parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown - may be used by
 /// parsers to alter their behavior according to the file path type.</param>
 /// <returns>The parsed result as a <see cref="DocData"/>
 /// (presumably the reusable <paramref name="docData"/> after it has been filled in - confirm per implementation).</returns>
 public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                               StringBuilder docBuf, ParsePathType pathType);
コード例 #7
0
 /// <summary>
 /// Delegates parsing to the parser that <c>pathType2parser</c> holds for
 /// <paramref name="pathType"/>, forwarding all arguments unchanged.
 /// </summary>
 /// <param name="docData">Reusable result, forwarded to the selected parser.</param>
 /// <param name="name">Name forwarded to the selected parser.</param>
 /// <param name="trecSrc">Calling content source, forwarded to the selected parser.</param>
 /// <param name="docBuf">Text to parse, forwarded to the selected parser.</param>
 /// <param name="pathType">Key used to select the parser; also forwarded to it.</param>
 /// <returns>Whatever the selected parser returns.</returns>
 public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                               StringBuilder docBuf, ParsePathType pathType)
 {
     // Pick the parser for this path type, then hand the work over to it.
     var selectedParser = pathType2parser[pathType];
     return selectedParser.Parse(docData, name, trecSrc, docBuf, pathType);
 }