/// <summary>
/// Runs a <c>Parser</c> over <paramref name="source"/> and copies the parsed body,
/// title and meta tags into <paramref name="docData"/>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <param name="name">Name assigned to the document.</param>
/// <param name="date">
/// Date to use unless the meta tags contain a non-null <c>"date"</c> entry that
/// <paramref name="trecSrc"/> manages to parse.
/// </param>
/// <param name="source">Input handed to the <c>Parser</c> constructor.</param>
/// <param name="trecSrc">Supplies <c>ParseDate</c> for the meta-tag date.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
{
    Parser parser = new Parser(source);

    // properties
    IDictionary<string, string> metaTags = parser.MetaTags;

    // A parseable "date" meta tag overrides the date supplied by the caller.
    string rawDate;
    bool hasDateTag = metaTags.TryGetValue("date", out rawDate);
    if (hasDateTag && rawDate != null)
    {
        DateTime? parsed = trecSrc.ParseDate(rawDate);
        if (parsed != null)
        {
            date = parsed;
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.Body = parser.Body;
    docData.Title = parser.Title;
    docData.Props = metaTags;
    docData.SetDate(date);
    return docData;
}
//TODO can we also extract title for this format?
/// <summary>
/// Fills <paramref name="docData"/> from the raw document text in <paramref name="docBuf"/>:
/// a date is extracted when the <c>TEXT</c> marker is present, and the body is the
/// tag-stripped text from just after that marker (or from the start when it is absent).
/// No title is set.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <param name="name">Name assigned to the document.</param>
/// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Not used by this implementation.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int bodyStart = 0; // number of leading characters skipped before the body
    DateTime? date = null;

    // optionally skip some of the text, set date (no title?)
    int textAt = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
    if (textAt >= 0)
    {
        int textEndAt = docBuf.IndexOf(TEXT_END, textAt, StringComparison.Ordinal);
        bodyStart = textAt + TEXT_LENGTH;

        // date...
        string rawDate = Extract(docBuf, DATE, DATE_END, textEndAt, DATE_NOISE_PREFIXES);
        if (rawDate != null)
        {
            string cleanedDate = StripTags(rawDate, 0).ToString();
            date = trecSrc.ParseDate(cleanedDate.Trim());
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Body = StripTags(docBuf, bodyStart).ToString();
    return docData;
}
/// <summary>
/// Fills <paramref name="docData"/> from the raw document text in <paramref name="docBuf"/>.
/// When the <c>HEADER</c> marker is present, the date and title are extracted and the
/// body starts right after <c>HEADER_END</c>; otherwise the whole buffer is the body
/// and date/title stay <c>null</c>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <param name="name">Name assigned to the document.</param>
/// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Not used by this implementation.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int mark = 0; // that much is skipped

    // optionally skip some of the text, set date, title
    DateTime? date = null;
    string title = null;

    // Markup delimiters are fixed tokens, so search ordinally (not culture-sensitively),
    // matching the other parsers that look up their markers with StringComparison.Ordinal.
    int h1 = docBuf.IndexOf(HEADER, StringComparison.Ordinal);
    if (h1 >= 0)
    {
        int h2 = docBuf.IndexOf(HEADER_END, h1, StringComparison.Ordinal);
        mark = h2 + HEADER_END_LENGTH;

        // date...
        string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }

        // title...
        title = Extract(docBuf, TI, TI_END, h2, null);
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, mark).ToString();
    return docData;
}
// TODO: we could take param to specify locale...
//private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
//                                                                       RuleBasedNumberFormat.SPELLOUT);

/// <summary>
/// Produces the next synthetic document: the body is the current counter value
/// spelled out via <c>ToWords()</c>, and the name/title are <c>"doc_"</c>/<c>"title_"</c>
/// followed by the counter formatted with the invariant culture.
/// The counter wraps from <see cref="long.MaxValue"/> to <see cref="long.MinValue"/>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData GetNextDocData(DocData docData)
{
    // The whole method runs under a single lock; the original additionally re-acquired
    // the same (re-entrant) lock around the counter update, which added nothing.
    lock (this)
    {
        docData.Clear();

        long curCounter = counter;
        if (counter == long.MaxValue)
        {
            counter = long.MinValue; //loop around
        }
        else
        {
            ++counter;
        }

        string counterText = curCounter.ToString(CultureInfo.InvariantCulture);
        docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
        docData.Name = "doc_" + counterText;
        docData.Title = "title_" + counterText;

        // `new DateTime()` is DateTime.MinValue (0001-01-01), not "now"; stamp the
        // document with the current time instead.
        docData.SetDate(DateTime.UtcNow);
        return docData;
    }
}
/// <summary>
/// Pulls the next tuple from <c>parser</c> and copies its <c>ID</c>, <c>BODY</c>,
/// <c>DATE</c> and <c>TITLE</c> slots into <paramref name="docData"/>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData GetNextDocData(DocData docData)
{
    // Fetch the tuple first so that docData is left untouched if Next() throws.
    string[] fields = parser.Next();

    docData.Clear();
    docData.Name = fields[ID];
    docData.Body = fields[BODY];
    docData.SetDate(fields[DATE]); // date is passed through as a string
    docData.Title = fields[TITLE];

    return docData;
}
/// <summary>
/// Produces a document whose body is always <c>DOC_TEXT</c> and whose name is
/// <c>"doc"</c> followed by a freshly allocated document id. The length of
/// <c>DOC_TEXT</c> is reported through <c>AddBytes</c>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData GetNextDocData(DocData docData)
{
    int docId = NewDocID();
    AddBytes(DOC_TEXT.Length);

    string docName = "doc" + docId;

    docData.Clear();
    docData.Name = docName;
    docData.Body = DOC_TEXT;
    return docData;
}
/// <summary>
/// Reads the next input file and turns it into a document. The file layout is:
/// line 1 = date, line 2 = skipped, line 3 = title, line 4 = skipped, and the
/// remaining lines form the body (joined with single spaces).
/// </summary>
/// <remarks>
/// Selecting the next file is done under the monitor on <c>this</c>; the file itself is
/// read outside the monitor. When all files have been consumed a new round begins
/// (incrementing <c>iteration</c>) unless <c>m_forever</c> is <c>false</c>.
/// </remarks>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
/// <exception cref="NoMoreDataException">
/// All files have been consumed and <c>m_forever</c> is <c>false</c>.
/// </exception>
public override DocData GetNextDocData(DocData docData)
{
    FileInfo f = null;
    string name = null;
    UninterruptableMonitor.Enter(this);
    try
    {
        if (nextFile >= inputFiles.Count)
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        f = inputFiles[nextFile++];
        name = f.GetCanonicalPath() + "_" + iteration;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    string dateStr;
    string title;
    StringBuilder bodyBuf = new StringBuilder(1024);

    // Scoped using block: the file is closed as soon as it has been read, without
    // the redundant explicit Dispose() the using declaration used to be paired with.
    using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
    {
        // First line is the date, 3rd is the title, rest is body
        dateStr = reader.ReadLine();
        reader.ReadLine(); // skip an empty line
        title = reader.ReadLine();
        reader.ReadLine(); // skip an empty line

        string line;
        while ((line = reader.ReadLine()) != null)
        {
            bodyBuf.Append(line).Append(' ');
        }
    }
    AddBytes(f.Length);

    // ReadLine() returns null for an empty file; leave the date unset in that
    // case instead of failing with a NullReferenceException on Trim().
    DateTime? date = null;
    if (dateStr != null)
    {
        date = ParseDate(dateStr.Trim());
    }

    docData.Clear();
    docData.Name = name;
    docData.Body = bodyBuf.ToString();
    docData.Title = title;
    docData.SetDate(date);
    return docData;
}
/// <summary>
/// Reads the next file produced by the <c>inputFiles</c> enumerator and turns it into a
/// document. The file layout is: line 1 = date, line 2 = skipped, line 3 = title,
/// line 4 = skipped, and the remaining lines form the body (joined with single spaces).
/// </summary>
/// <remarks>
/// Advancing the enumerator is done under <c>lock (this)</c>; the file itself is read
/// outside the lock. When the enumerator is exhausted a new one is created over
/// <c>dataDir</c> (incrementing <c>iteration</c>) unless <c>m_forever</c> is <c>false</c>.
/// </remarks>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
/// <exception cref="NoMoreDataException">
/// The enumerator is exhausted and <c>m_forever</c> is <c>false</c>, or a freshly
/// created enumerator yields no files at all.
/// </exception>
public override DocData GetNextDocData(DocData docData)
{
    FileInfo f = null;
    string name = null;
    lock (this)
    {
        if (!inputFiles.MoveNext())
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            inputFiles = new Enumerator(dataDir);
            iteration++;

            // A new enumerator is positioned before its first element, so Current is
            // not valid until MoveNext() succeeds. Without this call the first file of
            // every new round was read from an unpositioned enumerator.
            // NOTE(review): assumes Enumerator follows the standard IEnumerator contract.
            if (!inputFiles.MoveNext())
            {
                // Nothing to iterate over; avoid dereferencing an invalid Current.
                throw new NoMoreDataException();
            }
        }
        f = inputFiles.Current;
        // System.err.println(f);
        name = f.GetCanonicalPath() + "_" + iteration;
    }

    string line = null;
    string dateStr;
    string title;
    StringBuilder bodyBuf = new StringBuilder(1024);

    using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
    {
        //First line is the date, 3rd is the title, rest is body
        dateStr = reader.ReadLine();
        reader.ReadLine(); //skip an empty line
        title = reader.ReadLine();
        reader.ReadLine(); //skip an empty line
        while ((line = reader.ReadLine()) != null)
        {
            bodyBuf.Append(line).Append(' ');
        }
    }
    AddBytes(f.Length);

    DateTime? date = ParseDate(dateStr);

    docData.Clear();
    docData.Name = name;
    docData.Body = bodyBuf.ToString();
    docData.Title = title;
    docData.SetDate(date);
    return docData;
}
/// <summary>
/// Reads the next line from <c>reader</c> and lets <c>docDataLineReader</c> parse it
/// into <paramref name="docData"/>, assigning the document a sequential id taken
/// from <c>readCount</c>.
/// </summary>
/// <remarks>
/// Reading the line and allocating the id happen under the monitor on <c>this</c>;
/// parsing the line happens after the monitor has been released. At end of input the
/// file is reopened and reading restarts, unless <c>m_forever</c> is <c>false</c>.
/// The very first line read is used to create <c>docDataLineReader</c> and is then
/// skipped when <c>skipHeaderLine</c> is set.
/// </remarks>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
/// <exception cref="NoMoreDataException">
/// End of input was reached and <c>m_forever</c> is <c>false</c>.
/// </exception>
public override DocData GetNextDocData(DocData docData)
{
    string currentLine;
    int docId;

    UninterruptableMonitor.Enter(this);
    try
    {
        currentLine = reader.ReadLine();
        if (currentLine is null)
        {
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }

            // Reset the file
            OpenFile();
            return GetNextDocData(docData);
        }

        // first line ever, one time initialization,
        if (docDataLineReader is null)
        {
            docDataLineReader = CreateDocDataLineReader(currentLine);
            if (skipHeaderLine)
            {
                return GetNextDocData(docData);
            }
        }

        // increment IDS only once...
        docId = readCount++;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    // The date String was written in the format of DateTools.dateToString.
    docData.Clear();
    docData.ID = docId;
    docDataLineReader.ParseLine(docData, currentLine);
    return docData;
}
/// <summary>
/// Fills <paramref name="docData"/> from the raw document text in <paramref name="docBuf"/>:
/// the date comes from the <c>DATE</c> section (cut shortly after any <c>DATE_NOISE</c>
/// marker), the title from the <c>SUBJECT</c> section or, failing that, the
/// <c>HEADLINE</c> section, and the body is the whole buffer with tags stripped.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <param name="name">Name assigned to the document.</param>
/// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Not used by this implementation.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // date...
    DateTime? date = null;
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    if (rawDate != null)
    {
        int noiseAt = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
        if (noiseAt > 0)
        {
            // we need the "day" part, so keep three characters past the marker position
            rawDate = rawDate.Substring(0, noiseAt + 3);
        }

        string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
        date = trecSrc.ParseDate(cleanedDate);
    }

    // title... first try with SUBJECT, then with HEADLINE
    string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
                   ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
    if (title != null)
    {
        title = StripTags(title, 0).ToString().Trim();
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped for this format
    return docData;
}
// TODO: we could take param to specify locale...
//private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
//                                                                       RuleBasedNumberFormat.SPELLOUT);

/// <summary>
/// Produces the next synthetic document: the body is the current counter value
/// spelled out via <c>ToWords()</c>, and the name/title are <c>"doc_"</c>/<c>"title_"</c>
/// followed by the counter formatted with the invariant culture.
/// The counter wraps from <see cref="long.MaxValue"/> to <see cref="long.MinValue"/>.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData GetNextDocData(DocData docData)
{
    // The whole method already runs while holding the monitor on `this`, so the
    // counter update needs no second Enter/Exit pair on the same object.
    UninterruptableMonitor.Enter(this);
    try
    {
        docData.Clear();

        long curCounter = counter;
        if (counter == long.MaxValue)
        {
            counter = long.MinValue; //loop around
        }
        else
        {
            ++counter;
        }

        string counterText = curCounter.ToString(CultureInfo.InvariantCulture);
        docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
        docData.Name = "doc_" + counterText;
        docData.Title = "title_" + counterText;

        // `new DateTime()` is DateTime.MinValue (0001-01-01), not "now"; stamp the
        // document with the current time instead.
        docData.SetDate(DateTime.UtcNow);
        return docData;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }
}
/// <summary>
/// Fills <paramref name="docData"/> from the raw document text in <paramref name="docBuf"/>:
/// the date comes from the <c>DATE</c> section, the title from the <c>HEADLINE</c>
/// section (used as extracted), and the body is the whole buffer with tags stripped.
/// </summary>
/// <param name="docData">Instance to reset and populate; it is also the return value.</param>
/// <param name="name">Name assigned to the document.</param>
/// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date string.</param>
/// <param name="docBuf">Raw document text.</param>
/// <param name="pathType">Not used by this implementation.</param>
/// <returns>The same <paramref name="docData"/> instance, populated.</returns>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // date...
    string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
    DateTime? date = null;
    if (rawDate != null)
    {
        date = trecSrc.ParseDate(rawDate);
    }

    // title...
    string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = headline;
    docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped for this format
    return docData;
}