コード例 #1
0
        //TODO can we also extract title for this format?

        /// <summary>
        /// Fills <paramref name="docData"/> with the name, date and body found in
        /// <paramref name="docBuf"/>. This parser does not assign a title.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being filled.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source whose <c>ParseDate</c> converts the date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0; // number of leading characters left out of the body
            DateTime? parsedDate = null;

            int textStart = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
            if (textStart >= 0)
            {
                // The body begins right after the TEXT marker.
                bodyStart = textStart + TEXT_LENGTH;

                // The position of the closing marker is handed to Extract as its position argument.
                int textEnd = docBuf.IndexOf(TEXT_END, textStart, StringComparison.Ordinal);
                string rawDate = Extract(docBuf, DATE, DATE_END, textEnd, DATE_NOISE_PREFIXES);
                if (rawDate != null)
                {
                    string cleanDate = StripTags(rawDate, 0).ToString().Trim();
                    parsedDate = trecSrc.ParseDate(cleanDate);
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
コード例 #2
0
        /// <summary>
        /// Fills <paramref name="docData"/> with the name, date, title and body found in
        /// <paramref name="docBuf"/>. When a header section is present, the date and title
        /// are extracted using its end position and the body starts after the header
        /// terminator.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being filled.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source whose <c>ParseDate</c> converts the date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int mark = 0; // number of leading characters left out of the body
            DateTime? date = null;
            string title = null;

            int headerStart = docBuf.IndexOf(HEADER);
            if (headerStart >= 0)
            {
                int headerEnd = docBuf.IndexOf(HEADER_END, headerStart);

                // Only skip the header when its terminator was actually found. With a
                // missing terminator the index is -1, and adding HEADER_END_LENGTH to it
                // would yield an arbitrary offset that cuts off the start of the body.
                if (headerEnd >= 0)
                {
                    mark = headerEnd + HEADER_END_LENGTH;
                }

                // date...
                string dateStr = Extract(docBuf, DATE1, DATE1_END, headerEnd, null);
                if (dateStr != null)
                {
                    date = trecSrc.ParseDate(dateStr);
                }

                // title...
                title = Extract(docBuf, TI, TI_END, headerEnd, null);
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(date);
            docData.Title = title;
            docData.Body = StripTags(docBuf, mark).ToString();
            return docData;
        }
コード例 #3
0
        /// <summary>
        /// Fills <paramref name="docData"/> with the name, date, title and body found in
        /// <paramref name="docBuf"/>. The title is taken from the SUBJECT section, falling
        /// back to the HEADLINE section when no subject is found.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being filled.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source whose <c>ParseDate</c> converts the date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date...
            DateTime? parsedDate = null;
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            if (rawDate != null)
            {
                int noiseAt = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
                if (noiseAt > 0)
                {
                    // Cut the text three characters into the noise marker: the "day" part is needed.
                    rawDate = rawDate.Substring(0, noiseAt + 3);
                }
                string cleanDate = StripTags(rawDate, 0).ToString().Trim();
                parsedDate = trecSrc.ParseDate(cleanDate);
            }

            // title... first try SUBJECT, then HEADLINE
            string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
                           ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
            if (title != null)
            {
                title = StripTags(title, 0).ToString().Trim();
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = title;
            // Nothing is skipped: the body is built from the whole buffer.
            docData.Body = StripTags(docBuf, 0).ToString();
            return docData;
        }
コード例 #4
0
        /// <summary>
        /// Skips the non-HTML document header (when present), optionally reads a date
        /// from it, and hands the remaining text to the content source's HTML parser.
        /// </summary>
        /// <param name="docData">Reusable result passed on to the HTML parser.</param>
        /// <param name="name">Name passed on to the HTML parser.</param>
        /// <param name="trecSrc">Content source providing <c>ParseDate</c> and <c>HtmlParser</c>.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>Whatever the HTML parser returns for the remaining text.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            DateTime? date = null;
            int start = 0; // offset of the first character handed to the HTML parser

            int headerStart = docBuf.IndexOf(DOCHDR, StringComparison.Ordinal);
            if (headerStart >= 0)
            {
                int headerEnd = docBuf.IndexOf(TERMINATING_DOCHDR, headerStart, StringComparison.Ordinal);

                string dateStr = Extract(docBuf, DATE, DATE_END, headerEnd, null);
                if (dateStr != null)
                {
                    date = trecSrc.ParseDate(dateStr);
                }

                // Only skip the header when its terminator was actually found. With a
                // missing terminator the index is -1, and adding the terminator length
                // would yield an arbitrary offset that drops the start of the document.
                if (headerEnd >= 0)
                {
                    start = headerEnd + TERMINATING_DOCHDR.Length;
                }
            }

            string html = docBuf.ToString(start, docBuf.Length - start);

            return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc);
        }
コード例 #5
0
ファイル: TrecFTParser.cs プロジェクト: zhuthree/lucenenet
        /// <summary>
        /// Fills <paramref name="docData"/> with the name, date, title and body found in
        /// <paramref name="docBuf"/>. The title is the extracted HEADLINE section, used as is.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being filled.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="trecSrc">Content source whose <c>ParseDate</c> converts the date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date...
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            DateTime? parsedDate = null;
            if (rawDate != null)
            {
                parsedDate = trecSrc.ParseDate(rawDate);
            }

            // title...
            string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            // Nothing is skipped: the body is built from the whole buffer.
            docData.Body = StripTags(docBuf, 0).ToString();
            return docData;
        }
コード例 #6
0
        /// <summary>
        /// Unzips the bundled TREC sample documents, feeds them through a
        /// <see cref="TrecContentSource"/> configured with <see cref="TrecParserByPath"/>,
        /// and checks that exactly five documents are produced, one per path type
        /// (excluding <see cref="ParsePathType.UNKNOWN"/>), each with the expected content.
        /// </summary>
        public void TestTrecFeedDirAllTypes()
        {
            DirectoryInfo dataDir = CreateTempDir("trecFeedAllTypes");

            using (var zipStream = GetDataFile("trecdocs.zip"))
            {
                TestUtil.Unzip(zipStream, dataDir);
            }

            TrecContentSource tcs = new TrecContentSource();
            var props = new Dictionary<string, string>
            {
                ["print.props"] = "false",
                ["content.source.verbose"] = "false",
                ["content.source.excludeIteration"] = "true",
                ["doc.maker.forever"] = "false",
                ["docs.dir"] = dataDir.GetCanonicalPath().Replace('\\', '/'),
                ["trec.doc.parser"] = typeof(TrecParserByPath).AssemblyQualifiedName,
                ["content.source.forever"] = "false",
            };
            tcs.SetConfig(new Config(props));
            tcs.ResetInputs();

            DocData docData = new DocData();
            int docCount = 0;
            bool sawEndOfData = false;

            // LUCENENET specific - skip our UNKNOWN element.
            HashSet<ParsePathType> unseenTypes = new HashSet<ParsePathType>(
                Enum.GetValues(typeof(ParsePathType))
                    .Cast<ParsePathType>()
                    .Where(type => type != ParsePathType.UNKNOWN));

            try
            {
                // arbitrary limit to prevent looping forever in case of test failure
                while (docCount < 100)
                {
                    docData = tcs.GetNextDocData(docData);
                    ++docCount;
                    assertNotNull("doc data " + docCount + " should not be null!", docData);
                    unseenTypes.Remove(tcs.currPathType);
                    switch (tcs.currPathType)
                    {
                        case ParsePathType.GOV2:
                            assertDocData(docData, "TEST-000", "TEST-000 title", "TEST-000 text", tcs.ParseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
                            break;

                        case ParsePathType.FBIS:
                            assertDocData(docData, "TEST-001", "TEST-001 Title", "TEST-001 text", tcs.ParseDate("1 January 1991"));
                            break;

                        case ParsePathType.FR94:
                            // no title extraction in this source for now
                            assertDocData(docData, "TEST-002", null, "DEPARTMENT OF SOMETHING", tcs.ParseDate("February 3, 1994"));
                            break;

                        case ParsePathType.FT:
                            assertDocData(docData, "TEST-003", "Test-003 title", "Some pub text", tcs.ParseDate("980424"));
                            break;

                        case ParsePathType.LATIMES:
                            assertDocData(docData, "TEST-004", "Test-004 Title", "Some paragraph", tcs.ParseDate("January 17, 1997, Sunday"));
                            break;

                        default:
                            assertTrue("Should never get here!", false);
                            break;
                    }
                }
            }
            catch (NoMoreDataException)
            {
                // Expected: the source signals exhaustion by throwing.
                sawEndOfData = true;
            }

            assertTrue("Should have gotten NoMoreDataException!", sawEndOfData);
            assertEquals("Wrong number of documents created by source!", 5, docCount);
            assertTrue("Did not see all types!", unseenTypes.Count == 0);
        }
コード例 #7
0
ファイル: TrecDocParser.cs プロジェクト: zalintyre/lucenenet
 /// <summary>
 /// Parse the text prepared in <paramref name="docBuf"/> into a result <see cref="DocData"/>;
 /// no synchronization is required.
 /// </summary>
 /// <param name="docData">Reusable result.</param>
 /// <param name="name">Name that should be set to the result.</param>
 /// <param name="trecSrc">Calling trec content source.</param>
 /// <param name="docBuf">Text to parse.</param>
 /// <param name="pathType">Type of parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown - may be used by
 /// parsers to alter their behavior according to the file path type.</param>
 /// <returns>The parsed document data. Presumably this is the filled <paramref name="docData"/>
 /// instance - TODO confirm against the concrete parsers.</returns>
 public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                               StringBuilder docBuf, ParsePathType pathType);
コード例 #8
0
        /// <summary>
        /// Parses HTML from <paramref name="source"/> and fills <paramref name="docData"/>
        /// with its body, title, meta tags and date.
        /// </summary>
        /// <param name="docData">Reusable result; it is cleared before being filled.</param>
        /// <param name="name">Name assigned to the result.</param>
        /// <param name="date">Date to use unless the document's "date" meta tag parses to a value.</param>
        /// <param name="source">Input to parse.</param>
        /// <param name="trecSrc">Content source whose <c>ParseDate</c> converts the meta tag text.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
        {
            Parser parser = new Parser(source);

            // properties
            IDictionary<string, string> metaTags = parser.MetaTags;

            // A "date" meta tag that parses successfully replaces the caller's date.
            string metaDate;
            if (metaTags.TryGetValue("date", out metaDate) && metaDate != null)
            {
                DateTime? fromMeta = trecSrc.ParseDate(metaDate);
                if (fromMeta.HasValue)
                {
                    date = fromMeta;
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = parser.Body;
            docData.Title = parser.Title;
            docData.Props = metaTags;
            docData.SetDate(date);
            return docData;
        }
コード例 #9
0
 /// <summary>
 /// Wraps <paramref name="reader"/> in an <c>InputSource</c> and delegates to the
 /// <c>InputSource</c>-based overload, rethrowing SAX failures as <see cref="IOException"/>.
 /// </summary>
 /// <param name="docData">Reusable result passed to the overload.</param>
 /// <param name="name">Name passed to the overload.</param>
 /// <param name="date">Date passed to the overload.</param>
 /// <param name="reader">Text to parse.</param>
 /// <param name="trecSrc">Content source passed to the overload.</param>
 /// <returns>The result of the delegated call.</returns>
 /// <exception cref="IOException">A <c>SAXException</c> was thrown while parsing.</exception>
 public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
 {
     try
     {
         InputSource input = new InputSource(reader);
         return Parse(docData, name, date, input, trecSrc);
     }
     catch (SAXException cause)
     {
         // Keep the SAX failure as the inner exception.
         throw new IOException("SAX exception occurred while parsing HTML document.", cause);
     }
 }
コード例 #10
0
 /// <summary>
 /// Looks up the parser registered for <paramref name="pathType"/> in
 /// <c>pathType2parser</c> and forwards all arguments to it.
 /// </summary>
 /// <param name="docData">Reusable result passed to the selected parser.</param>
 /// <param name="name">Name passed to the selected parser.</param>
 /// <param name="trecSrc">Calling trec content source.</param>
 /// <param name="docBuf">Text to parse.</param>
 /// <param name="pathType">Key used to select the parser.</param>
 /// <returns>The result of the selected parser.</returns>
 public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                               StringBuilder docBuf, ParsePathType pathType)
 {
     var selectedParser = pathType2parser[pathType];
     return selectedParser.Parse(docData, name, trecSrc, docBuf, pathType);
 }