// TODO: can we also extract a title for this format?
/// <summary>
/// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: the name, an optional
/// date and the tag-stripped body. This parser does not assign a title.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being filled.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Path type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int bodyStart = 0; // number of leading characters excluded from the body
    DateTime? parsedDate = null;

    int textPos = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
    if (textPos >= 0)
    {
        // The body begins right after the TEXT marker.
        bodyStart = textPos + TEXT_LENGTH;

        // The date is looked up with the TEXT_END position passed as the limit argument.
        int textEndPos = docBuf.IndexOf(TEXT_END, textPos, StringComparison.Ordinal);
        string rawDate = Extract(docBuf, DATE, DATE_END, textEndPos, DATE_NOISE_PREFIXES);
        if (rawDate != null)
        {
            string cleanedDate = StripTags(rawDate, 0).ToString();
            parsedDate = trecSrc.ParseDate(cleanedDate.Trim());
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Body = StripTags(docBuf, bodyStart).ToString();
    return docData;
}
/// <summary>
/// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: the name, an optional
/// date, an optional title and the tag-stripped body. When a complete header
/// (<c>HEADER</c> ... <c>HEADER_END</c>) is present, the body starts after it.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being filled.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Path type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int mark = 0; // number of leading characters skipped when building the body
    DateTime? date = null;
    string title = null;

    // Markup tags are matched ordinally, as the other parsers in this file do.
    int h1 = docBuf.IndexOf(HEADER, StringComparison.Ordinal);
    if (h1 >= 0)
    {
        int h2 = docBuf.IndexOf(HEADER_END, h1, StringComparison.Ordinal);

        // Only skip the header when its end tag was actually found; otherwise
        // h2 is -1 and "h2 + HEADER_END_LENGTH" would be a meaningless offset
        // that silently clips the start of the body.
        if (h2 >= 0)
        {
            mark = h2 + HEADER_END_LENGTH;
        }

        // date...
        string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }

        // title...
        title = Extract(docBuf, TI, TI_END, h2, null);
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, mark).ToString();
    return docData;
}
/// <summary>
/// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: the name, an optional
/// date, an optional title (taken from <c>SUBJECT</c>, falling back to <c>HEADLINE</c>) and the
/// tag-stripped body. The whole buffer is used for the body.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being filled.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Path type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // Date: optionally cut the raw text shortly after the noise marker, then strip tags.
    DateTime? parsedDate = null;
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    if (rawDate != null)
    {
        int noisePos = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
        if (noisePos > 0)
        {
            // Keep three characters from where the marker starts: the "day" part is needed.
            rawDate = rawDate.Substring(0, noisePos + 3);
        }

        string cleanedDate = StripTags(rawDate, 0).ToString();
        parsedDate = trecSrc.ParseDate(cleanedDate.Trim());
    }

    // Title: first try SUBJECT, then HEADLINE.
    string heading = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
        ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
    if (heading != null)
    {
        heading = StripTags(heading, 0).ToString().Trim();
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Title = heading;
    docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped
    return docData;
}
/// <summary>
/// Skips the leading non-HTML header (<c>DOCHDR</c> ... <c>TERMINATING_DOCHDR</c>) when it is
/// complete, optionally reads a date from it, and hands the remaining text to the
/// content source's HTML parser.
/// </summary>
/// <param name="docData">Reusable result, passed through to the HTML parser.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source; supplies date parsing and the HTML parser.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Path type of the parsed file; not consulted by this parser.</param>
/// <returns>Whatever the HTML parser returns for the remaining text.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    DateTime? date = null;
    int start = 0; // offset at which the HTML handed to the parser begins

    int h1 = docBuf.IndexOf(DOCHDR, StringComparison.Ordinal);
    if (h1 >= 0)
    {
        int h2 = docBuf.IndexOf(TERMINATING_DOCHDR, h1, StringComparison.Ordinal);

        string dateStr = Extract(docBuf, DATE, DATE_END, h2, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }

        // Only skip the header when its terminator was actually found; with
        // h2 == -1 the old "h2 + Length" offset pointed at an arbitrary spot
        // near the start of the buffer and clipped the document.
        if (h2 >= 0)
        {
            start = h2 + TERMINATING_DOCHDR.Length;
        }
    }

    string html = docBuf.ToString(start, docBuf.Length - start);
    return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc);
}
/// <summary>
/// Populates <paramref name="docData"/> from <paramref name="docBuf"/>: the name, an optional
/// date, an optional title taken from <c>HEADLINE</c>, and the tag-stripped body.
/// The whole buffer is used for the body.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being filled.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Path type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // Date
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    DateTime? parsedDate = null;
    if (rawDate != null)
    {
        parsedDate = trecSrc.ParseDate(rawDate);
    }

    // Title
    string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Title = headline;
    docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped
    return docData;
}
/// <summary>
/// Unzips the sample TREC documents into a temp directory, reads them through a
/// <see cref="TrecContentSource"/> configured with <see cref="TrecParserByPath"/>, and checks
/// that exactly five documents are produced, one per path type, each with the expected
/// name, title, body and date.
/// </summary>
public void TestTrecFeedDirAllTypes()
{
    DirectoryInfo dataDir = CreateTempDir("trecFeedAllTypes");
    using (var stream = GetDataFile("trecdocs.zip"))
    {
        TestUtil.Unzip(stream, dataDir);
    }

    TrecContentSource tcs = new TrecContentSource();
    var props = new Dictionary<string, string>
    {
        ["print.props"] = "false",
        ["content.source.verbose"] = "false",
        ["content.source.excludeIteration"] = "true",
        ["doc.maker.forever"] = "false",
        ["docs.dir"] = dataDir.GetCanonicalPath().Replace('\\', '/'),
        ["trec.doc.parser"] = typeof(TrecParserByPath).AssemblyQualifiedName,
        ["content.source.forever"] = "false",
    };
    tcs.SetConfig(new Config(props));
    tcs.ResetInputs();

    // LUCENENET specific - skip our UNKNOWN element.
    var unseenTypes = new HashSet<ParsePathType>(
        Enum.GetValues(typeof(ParsePathType))
            .Cast<ParsePathType>()
            .Where(t => t != ParsePathType.UNKNOWN));

    DocData dd = new DocData();
    int n = 0;
    bool gotExpectedException = false;

    try
    {
        // Arbitrary upper bound so a failing test cannot loop forever.
        while (n < 100)
        {
            dd = tcs.GetNextDocData(dd);
            ++n;
            assertNotNull("doc data " + n + " should not be null!", dd);

            ParsePathType current = tcs.currPathType;
            unseenTypes.Remove(current);

            switch (current)
            {
                case ParsePathType.GOV2:
                    assertDocData(dd, "TEST-000", "TEST-000 title", "TEST-000 text",
                        tcs.ParseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
                    break;

                case ParsePathType.FBIS:
                    assertDocData(dd, "TEST-001", "TEST-001 Title", "TEST-001 text",
                        tcs.ParseDate("1 January 1991"));
                    break;

                case ParsePathType.FR94:
                    // no title extraction in this source for now
                    assertDocData(dd, "TEST-002", null, "DEPARTMENT OF SOMETHING",
                        tcs.ParseDate("February 3, 1994"));
                    break;

                case ParsePathType.FT:
                    assertDocData(dd, "TEST-003", "Test-003 title", "Some pub text",
                        tcs.ParseDate("980424"));
                    break;

                case ParsePathType.LATIMES:
                    assertDocData(dd, "TEST-004", "Test-004 Title", "Some paragraph",
                        tcs.ParseDate("January 17, 1997, Sunday"));
                    break;

                default:
                    assertTrue("Should never get here!", false);
                    break;
            }
        }
    }
    catch (NoMoreDataException)
    {
        // Expected: the source signals exhaustion by throwing.
        gotExpectedException = true;
    }

    assertTrue("Should have gotten NoMoreDataException!", gotExpectedException);
    assertEquals("Wrong number of documents created by source!", 5, n);
    assertTrue("Did not see all types!", unseenTypes.Count == 0);
}
/// <summary>
/// Parses the text prepared in <paramref name="docBuf"/> into a result <see cref="DocData"/>.
/// No synchronization is required.
/// </summary>
/// <param name="docData">Reusable result.</param>
/// <param name="name">Name that should be set on the result.</param>
/// <param name="trecSrc">Calling TREC content source.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">
/// Type of the parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown. Parsers may
/// use it to alter their behavior according to the file path type.
/// </param>
/// <returns>
/// The parsed document data (presumably the populated <paramref name="docData"/> instance;
/// confirm against the concrete parsers).
/// </returns>
public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType);
/// <summary>
/// Parses HTML from <paramref name="source"/> and fills <paramref name="docData"/> with the
/// name, body, title, meta-tag properties and date.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being filled.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="date">
/// Date to use unless the document carries a <c>date</c> meta tag that
/// <paramref name="trecSrc"/> parses to a non-null value.
/// </param>
/// <param name="source">HTML input.</param>
/// <param name="trecSrc">Calling content source; used here to parse the meta-tag date.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
{
    Parser htmlParser = new Parser(source);

    // A successfully parsed "date" meta tag takes precedence over the supplied date.
    IDictionary<string, string> metaTags = htmlParser.MetaTags;
    if (metaTags.TryGetValue("date", out string metaDate) && metaDate != null)
    {
        DateTime? fromMeta = trecSrc.ParseDate(metaDate);
        date = fromMeta ?? date;
    }

    docData.Clear();
    docData.Name = name;
    docData.Body = htmlParser.Body;
    docData.Title = htmlParser.Title;
    docData.Props = metaTags;
    docData.SetDate(date);
    return docData;
}
/// <summary>
/// Convenience overload: wraps <paramref name="reader"/> in an <see cref="InputSource"/> and
/// delegates to the <see cref="InputSource"/>-based overload.
/// </summary>
/// <param name="docData">Reusable result.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="date">Date passed through to the delegate overload.</param>
/// <param name="reader">Reader supplying the HTML text.</param>
/// <param name="trecSrc">Calling content source.</param>
/// <returns>The result of the delegate overload.</returns>
/// <exception cref="IOException">A <see cref="SAXException"/> was raised while parsing; it is kept as the inner exception.</exception>
public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
{
    try
    {
        InputSource wrapped = new InputSource(reader);
        return Parse(docData, name, date, wrapped, trecSrc);
    }
    catch (SAXException parseFailure)
    {
        // Surface SAX failures as I/O errors, preserving the original cause.
        throw new IOException("SAX exception occurred while parsing HTML document.", parseFailure);
    }
}
/// <summary>
/// Looks up the parser registered for <paramref name="pathType"/> in <c>pathType2parser</c>
/// and forwards all arguments to it unchanged.
/// </summary>
/// <param name="docData">Reusable result.</param>
/// <param name="name">Name to assign to the result.</param>
/// <param name="trecSrc">Calling content source.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Path type used to select the concrete parser.</param>
/// <returns>The result of the selected parser.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    var selectedParser = pathType2parser[pathType];
    return selectedParser.Parse(docData, name, trecSrc, docBuf, pathType);
}