/// <summary>
/// Parses the HTML document in <paramref name="source"/> into <paramref name="docData"/>.
/// The parsed meta tags become the document properties; when they contain a "date"
/// entry that <paramref name="trecSrc"/> can parse, it overrides the supplied
/// <paramref name="date"/>.
/// </summary>
public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
{
    Parser parser = new Parser(source);

    // Meta tags double as the document's property map; a parsable "date"
    // meta tag takes precedence over the caller-provided date.
    IDictionary<string, string> props = parser.MetaTags;
    if (props.TryGetValue("date", out string dateStr) && dateStr != null)
    {
        DateTime? parsedDate = trecSrc.ParseDate(dateStr);
        if (parsedDate != null)
        {
            date = parsedDate;
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.Body = parser.Body;
    docData.Title = parser.Title;
    docData.Props = props;
    docData.SetDate(date);
    return docData;
}
// Stores <paramref name="text"/> into whichever DocData field the given column
// position maps to (via posToF); PROP columns go into the lazily-created
// property dictionary under the column's header name.
private void SetDocDataField(DocData docData, int position, string text)
{
    var field = posToF[position];
    switch (field)
    {
        case FieldName.NAME:
            docData.Name = text;
            break;

        case FieldName.TITLE:
            docData.Title = text;
            break;

        case FieldName.DATE:
            docData.SetDate(text);
            break;

        case FieldName.BODY:
            docData.Body = text;
            break;

        case FieldName.PROP:
            // Create the property map on the first PROP column.
            var props = docData.Props;
            if (props is null)
            {
                props = new Dictionary<string, string>();
                docData.Props = props;
            }
            props[m_header[position]] = text;
            break;
    }
}
/// <summary>
/// Parses a line of the fixed form <c>title SEP date SEP body</c> into
/// <paramref name="docData"/>. Throws when either separator is missing or when
/// an extra separator appears after the date.
/// </summary>
public override void ParseLine(DocData docData, string line)
{
    int start = 0;

    // title
    int sep = line.IndexOf(WriteLineDocTask.SEP, start);
    if (sep < 0)
    {
        throw RuntimeException.Create("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
    }
    docData.Title = line.Substring(start, sep - start);
    start = sep + 1;

    // date
    sep = line.IndexOf(WriteLineDocTask.SEP, start);
    if (sep < 0)
    {
        throw RuntimeException.Create("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
    }
    docData.SetDate(line.Substring(start, sep - start));
    start = sep + 1;

    // body — must be the final field, so no further separator is allowed
    sep = line.IndexOf(WriteLineDocTask.SEP, start);
    if (sep >= 0)
    {
        throw RuntimeException.Create("line: [" + line + "] is in an invalid format (too many separators)!");
    }
    docData.Body = line.Substring(start);
}
// TODO: we could take param to specify locale...
//private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
//    RuleBasedNumberFormat.SPELLOUT);
/// <summary>
/// Produces a synthetic document whose body is the current counter value
/// spelled out in words; the counter wraps from <see cref="long.MaxValue"/>
/// to <see cref="long.MinValue"/>.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    lock (this)
    {
        docData.Clear();
        // The monitor is held for the whole method, so the counter can be read
        // and advanced directly; the nested lock(this) that used to surround
        // this section was redundant (monitors are reentrant and the lock was
        // already held).
        long curCounter = counter;
        if (counter == long.MaxValue)
        {
            counter = long.MinValue; // loop around
        }
        else
        {
            ++counter;
        }

        docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
        docData.Name = "doc_" + curCounter.ToString(CultureInfo.InvariantCulture);
        docData.Title = "title_" + curCounter.ToString(CultureInfo.InvariantCulture);
        docData.SetDate(new DateTime());
        return docData;
    }
}
/// <summary>
/// Parses an FBIS-style TREC document: when a HEADER section exists, the date
/// and title are extracted from it and everything up to the end of the header
/// is excluded from the body; the body itself is the tag-stripped remainder.
/// </summary>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    DateTime? date = null;
    string title = null;
    int skipTo = 0; // body text before this offset is skipped

    int headerStart = docBuf.IndexOf(HEADER);
    if (headerStart >= 0)
    {
        int headerEnd = docBuf.IndexOf(HEADER_END, headerStart);
        skipTo = headerEnd + HEADER_END_LENGTH;

        // date...
        string dateStr = Extract(docBuf, DATE1, DATE1_END, headerEnd, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }

        // title...
        title = Extract(docBuf, TI, TI_END, headerEnd, null);
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, skipTo).ToString();
    return docData;
}
//TODO can we also extract title for this format?
/// <summary>
/// Parses an FR94-style TREC document: when a TEXT section exists, a date is
/// extracted from it (noise prefixes removed) and body text before the TEXT
/// marker is skipped; no title is extracted for this format.
/// </summary>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    DateTime? date = null;
    int skipTo = 0; // body text before this offset is skipped

    int textStart = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
    if (textStart >= 0)
    {
        int textEnd = docBuf.IndexOf(TEXT_END, textStart, StringComparison.Ordinal);
        skipTo = textStart + TEXT_LENGTH;

        // date...
        string dateStr = Extract(docBuf, DATE, DATE_END, textEnd, DATE_NOISE_PREFIXES);
        if (dateStr != null)
        {
            dateStr = StripTags(dateStr, 0).ToString();
            date = trecSrc.ParseDate(dateStr.Trim());
        }
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Body = StripTags(docBuf, skipTo).ToString();
    return docData;
}
/// <summary>
/// Same as <see cref="MakeDocument()"/>, only this method creates a document of the
/// given size input by <paramref name="size"/>.
/// </summary>
public virtual Document MakeDocument(int size)
{
    // Leftover body text from a previous call may seed this document; discard
    // it when there is none or it is empty.
    LeftOver lvr = leftovr.Value;
    if (lvr is null || lvr.DocData is null || lvr.DocData.Body is null || lvr.DocData.Body.Length == 0)
    {
        ResetLeftovers();
    }
    DocData docData = GetDocState().docData;
    // Start either from the leftover data or from a fresh source document.
    DocData dd = (lvr is null ? m_source.GetNextDocData(docData) : lvr.DocData);
    int cnt = (lvr is null ? 0 : lvr.Count);
    // Keep concatenating source documents until the body reaches the requested size.
    while (dd.Body is null || dd.Body.Length < size)
    {
        DocData dd2 = dd;
        dd = m_source.GetNextDocData(new DocData());
        cnt = 0; // fresh data — the previous leftover count no longer applies
        dd.Body = (dd2.Body + dd.Body);
    }
    Document doc = CreateDocument(dd, size, cnt);
    // NOTE(review): this block appears truncated in the visible source (the
    // method's return / leftover-save branch is not shown here).
    if (dd.Body is null || dd.Body.Length == 0)
    {
        ResetLeftovers();
    }
/// <summary>
/// Creates a <see cref="Document"/> object ready for indexing. This method uses the
/// <see cref="ContentSource"/> to get the next document from the source, and creates
/// a <see cref="Document"/> object from the returned fields. If
/// <c>reuseFields</c> was set to <c>true</c>, it will reuse <see cref="Document"/>
/// and <see cref="Field"/> instances.
/// </summary>
/// <returns>A document built from the next source DocData.</returns>
public virtual Document MakeDocument()
{
    // Size-limited leftovers only apply to MakeDocument(int); clear them here.
    ResetLeftovers();
    DocData nextData = m_source.GetNextDocData(GetDocState().docData);
    return CreateDocument(nextData, 0, -1);
}
// Asserts that the given DocData is non-null and that its name, title, body,
// and date string all equal the expected values.
private void assertDocData(DocData dd, String expName, String expTitle, String expBody, String expDate)
{
    assertNotNull(dd);
    assertEquals(expName, dd.Name);
    assertEquals(expTitle, dd.Title);
    assertEquals(expBody, dd.Body);
    assertEquals(expDate, dd.Date);
}
/// <summary>
/// Fills <paramref name="docData"/> with the fixed <c>DOC_TEXT</c> body and a
/// name derived from a freshly assigned document id, tallying the byte count.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    int docId = NewDocID();
    AddBytes(DOC_TEXT.Length);

    docData.Clear();
    docData.Name = "doc" + docId;
    docData.Body = DOC_TEXT;
    return docData;
}
/// <summary>
/// Copies the next pre-parsed tuple from the underlying parser into
/// <paramref name="docData"/> (columns indexed by ID/TITLE/BODY/DATE).
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    string[] fields = parser.Next();

    docData.Clear();
    docData.Name = fields[ID];
    docData.Title = fields[TITLE];
    docData.Body = fields[BODY];
    docData.SetDate(fields[DATE]);
    return docData;
}
/// <summary>
/// Reads the next raw TREC document (doc start marker, DOCNO, then everything up
/// to the terminating doc marker) from the current input file, then hands the
/// buffered text to <c>trecDocParser</c> to produce the returned DocData.
/// Only the file reads are serialized; parsing runs outside the lock.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    string name = null;
    StringBuilder docBuf = GetDocBuffer();
    TrecDocParser.ParsePathType parsedPathType;
    // protect reading from the TREC files by multiple threads. The rest of the
    // method, i.e., parsing the content and returning the DocData can run unprotected.
    UninterruptableMonitor.Enter(@lock);
    try
    {
        if (reader == null)
        {
            OpenNextFile();
        }
        // 1. skip until doc start - required for all TREC formats
        docBuf.Length = 0;
        Read(docBuf, DOC, false, false);
        // save parsedFile for passing trecDataParser after the sync block, in
        // case another thread will open another file in between.
        parsedPathType = currPathType;
        // 2. name - required for all TREC formats
        docBuf.Length = 0;
        Read(docBuf, DOCNO, true, false);
        name = docBuf.ToString(DOCNO.Length, docBuf.IndexOf(TERMINATING_DOCNO, DOCNO.Length, StringComparison.Ordinal) - DOCNO.Length).Trim();
        if (!excludeDocnameIteration)
        {
            // Suffix the iteration number so repeated passes yield unique names.
            name = name + "_" + iteration;
        }
        // 3. read all until end of doc
        docBuf.Length = 0;
        Read(docBuf, TERMINATING_DOC, false, true);
    }
    finally
    {
        UninterruptableMonitor.Exit(@lock);
    }
    // count char length of text to be parsed (may be larger than the resulted plain doc body text).
    AddBytes(docBuf.Length);

    // This code segment relies on HtmlParser being thread safe. When we get
    // here, everything else is already private to that thread, so we're safe.
    docData = trecDocParser.Parse(docData, name, this, docBuf, parsedPathType);
    AddItem();
    return (docData);
}
/// <summary>
/// Returns the next input file's content as a DocData: line 1 is the date,
/// line 3 the title, and everything from line 5 on is joined into the body.
/// Only file selection is serialized; reading and parsing run unlocked.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    FileInfo f = null;
    string name = null;
    UninterruptableMonitor.Enter(this);
    try
    {
        if (nextFile >= inputFiles.Count)
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        f = inputFiles[nextFile++];
        name = f.GetCanonicalPath() + "_" + iteration;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    // The using declaration disposes the reader at scope end; the explicit
    // reader.Dispose() call that used to follow the read loop was redundant
    // and has been removed.
    using TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8);
    // First line is the date, 3rd is the title, rest is body
    string dateStr = reader.ReadLine();
    reader.ReadLine();// skip an empty line
    string title = reader.ReadLine();
    reader.ReadLine();// skip an empty line
    StringBuilder bodyBuf = new StringBuilder(1024);
    string line = null;
    while ((line = reader.ReadLine()) != null)
    {
        bodyBuf.Append(line).Append(' ');
    }
    AddBytes(f.Length);

    // NOTE(review): dateStr.Trim() throws NullReferenceException on an empty
    // file — presumably inputs always start with a date line; confirm upstream.
    DateTime? date = ParseDate(dateStr.Trim());

    docData.Clear();
    docData.Name = name;
    docData.Body = bodyBuf.ToString();
    docData.Title = title;
    docData.SetDate(date);
    return docData;
}
// Asserts that the given DocData is non-null, that name/title match exactly,
// that the body contains the expected snippet, and that the stored date string
// (round-tripped through DateTools) equals the expected DateTime.
private void assertDocData(DocData dd, String expName, String expTitle, String expBody, DateTime? expDate)
{
    assertNotNull(dd);
    assertEquals(expName, dd.Name);
    assertEquals(expTitle, dd.Title);
    // The body only needs to contain the expected snippet, not match exactly.
    assertTrue(dd.Body.IndexOf(expBody) != -1);

    // Convert the stored date string (if any) back into a DateTime for comparison.
    DateTime? actualDate;
    if (dd.Date != null)
    {
        actualDate = DateTools.StringToDate(dd.Date);
    }
    else
    {
        actualDate = null;
    }
    assertEquals(expDate, actualDate);
}
/// <summary>
/// Convenience overload: wraps <paramref name="reader"/> in an
/// <see cref="InputSource"/> and delegates to the InputSource-based Parse.
/// SAX failures are surfaced as <see cref="IOException"/>.
/// </summary>
public virtual DocData Parse(DocData docData, string name, DateTime? date, TextReader reader, TrecContentSource trecSrc)
{
    try
    {
        InputSource wrapped = new InputSource(reader);
        return Parse(docData, name, date, wrapped, trecSrc);
    }
    catch (SAXException saxe)
    {
        throw new IOException("SAX exception occurred while parsing HTML document.", saxe);
    }
}
/// <summary>
/// Returns the next file from the directory enumerator as a DocData: line 1 is
/// the date, line 3 the title, and everything from line 5 on becomes the body.
/// When the enumerator is exhausted, a new round starts unless forever is off.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    FileInfo file = null;
    string docName = null;
    lock (this)
    {
        if (!inputFiles.MoveNext())
        {
            // exhausted files, start a new round, unless forever set to false.
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            inputFiles = new Enumerator(dataDir);
            iteration++;
        }
        file = inputFiles.Current;
        // System.err.println(f);
        docName = file.GetCanonicalPath() + "_" + iteration;
    }

    string dateLine;
    string titleLine;
    StringBuilder body = new StringBuilder(1024);
    using (TextReader reader = new StreamReader(new FileStream(file.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
    {
        //First line is the date, 3rd is the title, rest is body
        dateLine = reader.ReadLine();
        reader.ReadLine(); //skip an empty line
        titleLine = reader.ReadLine();
        reader.ReadLine(); //skip an empty line
        string current;
        while ((current = reader.ReadLine()) != null)
        {
            body.Append(current).Append(' ');
        }
    }
    AddBytes(file.Length);

    DateTime? parsedDate = ParseDate(dateLine);

    docData.Clear();
    docData.Name = docName;
    docData.Body = body.ToString();
    docData.Title = titleLine;
    docData.SetDate(parsedDate);
    return docData;
}
/// <summary>
/// Parses one tab-separated Geonames line into <paramref name="docData"/> as a
/// WKT point. Columns used: 0 = ID, 1 = Name, 4 = Lat, 5 = Lon.
/// </summary>
public override void ParseLine(DocData docData, string line)
{
    // Sample data line:
    // 3578267, Morne du Vitet, Morne du Vitet, 17.88333, -62.8, ...
    // ID, Name, Alternate name (unused), Lat, Lon, ...
    //
    // Split on tabs, capped at 7 pieces since only the first 6 fields are
    // needed. string.Split replaces the previous per-call `new Regex("\\t")`,
    // which recompiled the pattern on every line; the count semantics are the
    // same (remainder stays unsplit in the last element).
    string[] parts = line.Split(new char[] { '\t' }, 7);
    docData.ID = Convert.ToInt32(parts[0], CultureInfo.InvariantCulture);//note: overwrites ID assigned by LineDocSource
    docData.Name = parts[1];
    string latitude = parts[4];
    string longitude = parts[5];
    docData.Body = "POINT(" + longitude + " " + latitude + ")";//WKT is x y order
}
// A single-page wiki dump should yield exactly one document and then report
// that no more data is available.
public void TestOneDocument()
{
    String xml = "<mediawiki>\r\n" + PAGE1 + "</mediawiki>";
    EnwikiContentSource source = createContentSource(xml, false);

    DocData dd = source.GetNextDocData(new DocData());
    assertDocData(dd, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");

    assertNoMoreDataException(source);
}
// With forever=true the single TREC document is served repeatedly; only the
// iteration suffix on the document name changes between passes.
public void TestForever()
{
    String docs = "<DOC>\r\n" +
        "<DOCNO>TEST-000</DOCNO>\r\n" +
        //"<docno>TEST-000</docno>\r\n" +
        "<DOCHDR>\r\n" +
        "http://lucene.apache.org.trecdocmaker.test\r\n" +
        "HTTP/1.1 200 OK\r\n" +
        "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
        "Server: Apache/1.3.27 (Unix)\r\n" +
        "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
        "Content-Length: 614\r\n" +
        "Connection: close\r\n" +
        "Content-Type: text/html\r\n" +
        "</DOCHDR>\r\n" +
        "<html>\r\n" +
        "\r\n" +
        "<head>\r\n" +
        "<title>\r\n" +
        "TEST-000 title\r\n" +
        "</title>\r\n" +
        "</head>\r\n" +
        "\r\n" +
        "<body>\r\n" +
        "TEST-000 text\r\n" +
        "\r\n" +
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecSource source = new StringableTrecSource(docs, true);
    source.SetConfig(null);

    DocData dd = source.GetNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .ParseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    // same document, but the second iteration changes the name.
    dd = source.GetNextDocData(dd);
    assertDocData(dd, "TEST-000_1", "TEST-000 title", "TEST-000 text", source
        .ParseDate("Sun, 11 Jan 2009 08:00:00 GMT"));
    source.Dispose();

    // Don't test that NoMoreDataException is thrown, since the forever flag is
    // turned on.
}
// Test stub: yields exactly one synthetic document (fixed body/date/title and
// a single "key"="value" property), then signals exhaustion on the next call.
public override DocData GetNextDocData(DocData docData)
{
    if (finish)
    {
        throw new NoMoreDataException();
    }

    docData.Body = ("body");
    docData.SetDate("date");
    docData.Title = ("title");

    Dictionary<string, string> props = new Dictionary<string, string>
    {
        ["key"] = "value"
    };
    docData.Props = props;

    finish = true;
    return docData;
}
/// <summary>
/// Builds up to <c>query.file.maxQueries</c> spatial queries by reading shapes
/// from the configured query file via a <see cref="LineDocSource"/>; lines that
/// do not yield a shape are skipped without counting. Stops early when the
/// source is exhausted.
/// </summary>
protected override Query[] PrepareQueries()
{
    int maxQueries = m_config.Get("query.file.maxQueries", 1000);
    Config srcConfig = new Config(new Dictionary<string, string>());
    srcConfig.Set("docs.file", m_config.Get("query.file", null));
    srcConfig.Set("line.parser", m_config.Get("query.file.line.parser", null));
    srcConfig.Set("content.source.forever", "false");

    JCG.List<Query> queries = new JCG.List<Query>();
    LineDocSource src = new LineDocSource();
    try
    {
        src.SetConfig(srcConfig);
        src.ResetInputs();
        DocData docData = new DocData();
        for (int i = 0; i < maxQueries; i++)
        {
            docData = src.GetNextDocData(docData);
            IShape shape = SpatialDocMaker.MakeShapeFromString(m_strategy, docData.Name, docData.Body);
            if (shape != null)
            {
                shape = m_shapeConverter.Convert(shape);
                queries.Add(MakeQueryFromShape(shape));
            }
            else
            {
                i--;//skip
            }
        }
    }
    // Exception variable removed (it was unused and suppressed via
    // #pragma warning disable 168); catching the type alone is the idiomatic form.
    catch (NoMoreDataException)
    {
        //all-done
    }
    finally
    {
        src.Dispose();
    }
    return queries.ToArray();
}
/// <summary>
/// Reads the next line from the shared reader and parses it into
/// <paramref name="docData"/> via the configured line reader. On end of input,
/// either re-opens the file (forever mode) or throws
/// <see cref="NoMoreDataException"/>. Only line reading and ID assignment are
/// serialized; parsing runs outside the lock.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    string line;
    int myID;

    UninterruptableMonitor.Enter(this);
    try
    {
        line = reader.ReadLine();
        if (line is null)
        {
            if (!m_forever)
            {
                throw new NoMoreDataException();
            }
            // Reset the file
            OpenFile();
            // Retry with the freshly re-opened file.
            return GetNextDocData(docData);
        }
        if (docDataLineReader is null)
        {
            // first line ever, one time initialization,
            docDataLineReader = CreateDocDataLineReader(line);
            if (skipHeaderLine)
            {
                // The first line was a header; recurse to fetch the first real line.
                return GetNextDocData(docData);
            }
        }

        // increment IDS only once...
        myID = readCount++;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }

    // The date String was written in the format of DateTools.dateToString.
    docData.Clear();
    docData.ID = myID;
    docDataLineReader.ParseLine(docData, line);
    return docData;
}
// A single TREC document (with DOCHDR and HTML payload) should parse into one
// DocData — name suffixed with iteration 0, title/body extracted from the HTML,
// date from the header — followed by NoMoreDataException.
public void TestOneDocument()
{
    String docs = "<DOC>\r\n" +
        "<DOCNO>TEST-000</DOCNO>\r\n" +
        "<DOCHDR>\r\n" +
        "http://lucene.apache.org.trecdocmaker.test\r\n" +
        "HTTP/1.1 200 OK\r\n" +
        "Date: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
        "Server: Apache/1.3.27 (Unix)\r\n" +
        "Last-Modified: Sun, 11 Jan 2009 08:00:00 GMT\r\n" +
        "Content-Length: 614\r\n" +
        "Connection: close\r\n" +
        "Content-Type: text/html\r\n" +
        "</DOCHDR>\r\n" +
        "<html>\r\n" +
        "\r\n" +
        "<head>\r\n" +
        "<title>\r\n" +
        "TEST-000 title\r\n" +
        "</title>\r\n" +
        "</head>\r\n" +
        "\r\n" +
        "<body>\r\n" +
        "TEST-000 text\r\n" +
        "\r\n" +
        "</body>\r\n" +
        "\r\n" +
        "</DOC>";
    StringableTrecSource source = new StringableTrecSource(docs, false);
    source.SetConfig(null);

    DocData dd = source.GetNextDocData(new DocData());
    assertDocData(dd, "TEST-000_0", "TEST-000 title", "TEST-000 text", source
        .ParseDate("Sun, 11 Jan 2009 08:00:00 GMT"));

    assertNoMoreDataException(source);
}
/// <summary>
/// Parses an FT-style TREC document: extracts a date (keeping only the "day"
/// part before the DATE_NOISE marker), a title from SUBJECT or, failing that,
/// HEADLINE, and a tag-stripped body.
/// </summary>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    int mark = 0; // that much is skipped

    // date...
    DateTime? date = null;
    string dateStr = Extract(docBuf, DATE, DATE_END, -1, null);
    if (dateStr != null)
    {
        int noiseAt = dateStr.IndexOf(DATE_NOISE, StringComparison.Ordinal);
        if (noiseAt > 0)
        {
            // we need the "day" part
            dateStr = dateStr.Substring(0, noiseAt + 3);
        }
        dateStr = StripTags(dateStr, 0).ToString();
        date = trecSrc.ParseDate(dateStr.Trim());
    }

    // title... first try with SUBJECT, them with HEADLINE
    string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
        ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
    if (title != null)
    {
        title = StripTags(title, 0).ToString().Trim();
    }

    docData.Clear();
    docData.Name = name;
    docData.SetDate(date);
    docData.Title = title;
    docData.Body = StripTags(docBuf, mark).ToString();
    return docData;
}
/// <summary>
/// Parses a GOV2-style TREC document: the DOCHDR section (if present) supplies
/// the date and marks where the HTML payload begins; the payload is then handed
/// to the content source's HTML parser.
/// </summary>
public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType)
{
    // skip some of the non-html text, optionally set date
    DateTime? date = null;
    int htmlStart = 0;

    int hdrStart = docBuf.IndexOf(DOCHDR, StringComparison.Ordinal);
    if (hdrStart >= 0)
    {
        int hdrEnd = docBuf.IndexOf(TERMINATING_DOCHDR, hdrStart, StringComparison.Ordinal);
        string dateStr = Extract(docBuf, DATE, DATE_END, hdrEnd, null);
        if (dateStr != null)
        {
            date = trecSrc.ParseDate(dateStr);
        }
        htmlStart = hdrEnd + TERMINATING_DOCHDR.Length;
    }

    // Everything after the header is HTML; delegate to the configured HTML parser.
    string html = docBuf.ToString(htmlStart, docBuf.Length - htmlStart);
    return trecSrc.HtmlParser.Parse(docData, name, date, new StringReader(html), trecSrc);
}
/// <summary>
/// Parses a separator-delimited line whose columns correspond to the header row
/// (m_header), dispatching each field via <see cref="SetDocDataField"/>.
/// Throws when the field count does not match the header.
/// </summary>
public override void ParseLine(DocData docData, string line)
{
    int fieldIndex = 0;
    int start = 0;
    int sepPos;
    while ((sepPos = line.IndexOf(WriteLineDocTask.SEP, start)) >= 0)
    {
        if (fieldIndex >= m_header.Length)
        {
            throw RuntimeException.Create("input line has invalid format: " + (fieldIndex + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
        }
        SetDocDataField(docData, fieldIndex, line.Substring(start, sepPos - start));
        ++fieldIndex;
        start = sepPos + 1;
    }

    // Exactly one field (the last) must remain after the final separator.
    if (fieldIndex != m_header.Length - 1)
    {
        throw RuntimeException.Create("input line has invalid format: " + (fieldIndex + 1) + " fields instead of " + m_header.Length + " :: [" + line + "]");
    }
    // last one
    SetDocDataField(docData, fieldIndex, line.Substring(start));
}
/// <summary>
/// Decorates the base document with random sort/string/country properties.
/// The RNG is consumed in the same order as before (sort field, string length,
/// string chars, country) so sequences are unchanged for a given seed.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    docData = base.GetNextDocData(docData);

    var props = new Dictionary<string, string>
    {
        // random int
        ["sort_field"] = r.Next(sortRange).ToString(CultureInfo.InvariantCulture),
    };

    // random string
    int len = NextInt32(2, 20);
    char[] buffer = new char[len];
    for (int i = 0; i < len; i++)
    {
        buffer[i] = (char)r.Next(0x80);
    }
    props["random_string"] = new string(buffer);

    // random country
    props["country"] = COUNTRIES[r.Next(COUNTRIES.Length)];

    docData.Props = props;
    return docData;
}
// With forever=true the same two wiki documents should come back on every pass;
// run three passes to confirm the source wraps around.
public void TestForever()
{
    String docs = "<mediawiki>\r\n" + PAGE1 + PAGE2 + "</mediawiki>";
    EnwikiContentSource source = createContentSource(docs, true);

    for (int round = 0; round < 3; round++)
    {
        DocData first = source.GetNextDocData(new DocData());
        assertDocData(first, "1", "Title1", "Some text 1 here", "14-SEP-2011 11:35:09.000");

        DocData second = source.GetNextDocData(new DocData());
        assertDocData(second, "2", "Title2", "Some text 2 here", "14-SEP-2022 22:35:09.000");

        // Don't test that NoMoreDataException is thrown, since the forever flag is turned on.
    }
    source.Dispose();
}
// TODO: we could take param to specify locale...
//private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
//    RuleBasedNumberFormat.SPELLOUT);
/// <summary>
/// Produces a synthetic document whose body is the current counter value
/// spelled out in words; the counter wraps from <see cref="long.MaxValue"/>
/// to <see cref="long.MinValue"/>.
/// </summary>
public override DocData GetNextDocData(DocData docData)
{
    UninterruptableMonitor.Enter(this);
    try
    {
        docData.Clear();
        // The monitor is held for the whole method, so the counter can be read
        // and advanced directly. The nested Enter/Exit on the same object that
        // used to surround this section was redundant (monitors are reentrant),
        // which answers the old "LUCENENET TODO" question — it has been removed.
        long curCounter = counter;
        if (counter == long.MaxValue)
        {
            counter = long.MinValue;//loop around
        }
        else
        {
            ++counter;
        }

        docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
        docData.Name = "doc_" + curCounter.ToString(CultureInfo.InvariantCulture);
        docData.Title = "title_" + curCounter.ToString(CultureInfo.InvariantCulture);
        docData.SetDate(new DateTime());
        return docData;
    }
    finally
    {
        UninterruptableMonitor.Exit(this);
    }
}
// LUCENENET specific: de-nested IShapeConverter
/// <summary>
/// Builds the base document, then derives a spatial shape from the body text
/// stored in the document and indexes it via the configured strategy.
/// </summary>
public override Document MakeDocument()
{
    DocState docState = GetDocState();
    Document doc = base.MakeDocument();

    // Set SPATIAL_FIELD from body
    DocData docData = docState.docData;
    // makeDocument() resets docState.getBody() so we can't look there; look in Document
    string shapeStr = doc.GetField(DocMaker.BODY_FIELD).GetStringValue();
    IShape shape = MakeShapeFromString(strategy, docData.Name, shapeStr);
    if (shape is null)
    {
        return doc;
    }

    shape = shapeConverter.Convert(shape);
    //index
    foreach (Field field in strategy.CreateIndexableFields(shape))
    {
        doc.Add(field);
    }
    return doc;
}