// TODO can we also extract title for this format?
/// <summary>
/// Fills <paramref name="docData"/> with the name, date and tag-stripped body found in
/// <paramref name="docBuf"/>. No title is set for this format.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being populated.</param>
/// <param name="name">Name assigned to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int bodyStart = 0; // number of leading characters left out of the body
    DateTime? parsedDate = null;

    // When the TEXT marker is present, skip everything up to (and including) it and look for a date.
    int textOpen = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
    if (textOpen >= 0)
    {
        int textClose = docBuf.IndexOf(TEXT_END, textOpen, StringComparison.Ordinal);
        bodyStart = textOpen + TEXT_LENGTH;

        // The TEXT_END position is handed to Extract as its position argument
        // (presumably an upper bound for the search; -1 when TEXT_END is absent).
        string rawDate = Extract(docBuf, DATE, DATE_END, textClose, DATE_NOISE_PREFIXES);
        if (rawDate != null)
        {
            // Remove embedded markup and surrounding whitespace before parsing.
            string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
            parsedDate = trecSrc.ParseDate(cleanedDate);
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Body = StripTags(docBuf, bodyStart).ToString();
    return docData;
}
/// <summary>
/// Fills <paramref name="docData"/> with the name, date, title and tag-stripped body found in
/// <paramref name="docBuf"/>. When a <c>HEADER</c> section is present, the date and title are
/// looked up via <c>Extract</c> and the body starts after <c>HEADER_END</c>; otherwise the
/// whole buffer is used as the body and date/title stay <c>null</c>.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being populated.</param>
/// <param name="name">Name assigned to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int mark = 0; // that much is skipped
    // optionally skip some of the text, set date, title
    DateTime? date = null;
    string title = null;

    // Markup tags are matched ordinally so the search never depends on the current culture.
    int h1 = docBuf.IndexOf(HEADER, StringComparison.Ordinal);
    if (h1 >= 0)
    {
        int h2 = docBuf.IndexOf(HEADER_END, h1, StringComparison.Ordinal);
        // NOTE(review): h2 is not checked for -1 (HEADER without HEADER_END); the original
        // behavior is kept here - confirm whether malformed headers need special handling.
        mark = h2 + HEADER_END_LENGTH;

        // date...
        string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }

        // title...
        title = Extract(docBuf, TI, TI_END, h2, null);
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, mark).ToString();
    return docData;
}
/// <summary>
/// Fills <paramref name="docData"/> with the name, date, title and tag-stripped body found in
/// <paramref name="docBuf"/>. The title is taken from the <c>SUBJECT</c> element, falling back
/// to <c>HEADLINE</c> when no subject is extracted.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being populated.</param>
/// <param name="name">Name assigned to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // Nothing is skipped for this format: the body is stripped from the start of the buffer.
    const int bodyStart = 0;

    // date...
    DateTime? parsedDate = null;
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    if (rawDate != null)
    {
        int noiseAt = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
        if (noiseAt > 0) // a match at index 0 leaves the string untouched
        {
            // Keep the text up to and including the "day" part
            // (presumably 3 is the length of DATE_NOISE - verify against its definition).
            rawDate = rawDate.Substring(0, noiseAt + 3);
        }

        string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
        parsedDate = trecSrc.ParseDate(cleanedDate);
    }

    // title... first try with SUBJECT, then with HEADLINE
    string heading = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
        ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
    if (heading != null)
    {
        heading = StripTags(heading, 0).ToString().Trim();
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Title = heading;
    docData.Body = StripTags(docBuf, bodyStart).ToString();
    return docData;
}
/// <summary>
/// Splits <paramref name="docBuf"/> into an optional <c>DOCHDR</c> header (from which a date
/// may be extracted) and the remaining text, then hands that remaining text to the content
/// source's HTML parser.
/// </summary>
/// <param name="docData">Reusable result, passed through to the HTML parser.</param>
/// <param name="name">Name passed through to the HTML parser.</param>
/// <param name="trecSrc">Calling content source; supplies date parsing and the HTML parser.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of the parsed file; not consulted by this parser.</param>
/// <returns>Whatever <c>trecSrc.HtmlParser.Parse</c> returns for the HTML portion.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    DateTime? parsedDate = null;
    int htmlStart = 0;

    // Skip the non-HTML header, if any, and optionally pick up the date from it.
    int headerOpen = docBuf.IndexOf(DOCHDR, StringComparison.Ordinal);
    if (headerOpen >= 0)
    {
        int headerClose = docBuf.IndexOf(TERMINATING_DOCHDR, headerOpen, StringComparison.Ordinal);

        string rawDate = Extract(docBuf, DATE, DATE_END, headerClose, null);
        if (rawDate != null)
        {
            parsedDate = trecSrc.ParseDate(rawDate);
        }

        // NOTE(review): headerClose is not checked for -1 (DOCHDR without its terminator);
        // behavior is kept as-is - confirm whether malformed headers can occur.
        htmlStart = headerClose + TERMINATING_DOCHDR.Length;
    }

    int htmlLength = docBuf.Length - htmlStart;
    string htmlText = docBuf.ToString(htmlStart, htmlLength);
    return trecSrc.HtmlParser.Parse(docData, name, parsedDate, new StringReader(htmlText), trecSrc);
}
/// <summary>
/// Fills <paramref name="docData"/> with the name, date, title and tag-stripped body found in
/// <paramref name="docBuf"/>. The date comes from the <c>DATE</c> element and the title from
/// the <c>HEADLINE</c> element; either stays <c>null</c> when nothing is extracted.
/// </summary>
/// <param name="docData">Reusable result; it is cleared before being populated.</param>
/// <param name="name">Name assigned to the result.</param>
/// <param name="trecSrc">Calling content source; used here to parse the date string.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of the parsed file; not consulted by this parser.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // Nothing is skipped for this format: the body is stripped from the start of the buffer.
    const int bodyStart = 0;

    // date...
    DateTime? parsedDate = null;
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    if (rawDate != null)
    {
        parsedDate = trecSrc.ParseDate(rawDate);
    }

    // title...
    string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

    docData.Clear();
    docData.Name = name;
    docData.SetDate(parsedDate);
    docData.Title = headline;
    docData.Body = StripTags(docBuf, bodyStart).ToString();
    return docData;
}
/// <summary>
/// Parse the text prepared in <paramref name="docBuf"/> into a result <see cref="DocData"/>;
/// no synchronization is required.
/// </summary>
/// <param name="docData">Reusable result.</param>
/// <param name="name">Name that should be set to the result.</param>
/// <param name="trecSrc">Calling trec content source.</param>
/// <param name="docBuf">Text to parse.</param>
/// <param name="pathType">Type of parsed file, or <see cref="ParsePathType.UNKNOWN"/> if unknown - may be used by
/// parsers to alter their behavior according to the file path type.</param>
/// <returns>The parsed document data (presumably the populated <paramref name="docData"/>
/// instance - the contract is defined by each implementation).</returns>
public abstract DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType);
/// <summary>
/// Looks up the parser registered for <paramref name="pathType"/> in <c>pathType2parser</c>
/// and forwards all arguments to it unchanged.
/// </summary>
/// <param name="docData">Reusable result, passed through to the selected parser.</param>
/// <param name="name">Name passed through to the selected parser.</param>
/// <param name="trecSrc">Calling content source, passed through to the selected parser.</param>
/// <param name="docBuf">Text to parse, passed through to the selected parser.</param>
/// <param name="pathType">Type of the parsed file; selects which parser handles the document.</param>
/// <returns>The result returned by the selected parser.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    var selectedParser = pathType2parser[pathType];
    return selectedParser.Parse(docData, name, trecSrc, docBuf, pathType);
}