Esempio n. 1
0
        //TODO can we also extract title for this format?

        /// <summary>
        /// Populates <paramref name="docData"/> from the raw document text in
        /// <paramref name="docBuf"/>. When the TEXT marker is present, the body starts
        /// right after it and a date is looked up via the DATE/DATE_END markers.
        /// The title is not assigned by this parser.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <param name="name">Value stored as the document name.</param>
        /// <param name="trecSrc">Source whose <c>ParseDate</c> converts the extracted date text.</param>
        /// <param name="docBuf">Raw text of one document.</param>
        /// <param name="pathType">Not used by this parser.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0;            // offset passed to StripTags for the body
            DateTime? parsedDate = null;

            int textPos = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
            if (textPos >= 0)
            {
                // Everything up to and including the TEXT marker is skipped.
                bodyStart = textPos + TEXT_LENGTH;

                // The TEXT_END position (may be -1 when absent) is forwarded to Extract.
                int textEndPos = docBuf.IndexOf(TEXT_END, textPos, StringComparison.Ordinal);
                string rawDate = Extract(docBuf, DATE, DATE_END, textEndPos, DATE_NOISE_PREFIXES);
                if (rawDate != null)
                {
                    string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                    parsedDate = trecSrc.ParseDate(cleanedDate);
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
Esempio n. 2
0
        /// <summary>
        /// Splits <paramref name="line"/> into exactly three fields - title, date and body -
        /// delimited by <c>WriteLineDocTask.SEP</c>, and stores them in <paramref name="docData"/>.
        /// </summary>
        /// <param name="docData">Receives the title, date and body.</param>
        /// <param name="line">One input line in <c>title SEP date SEP body</c> form.</param>
        /// <remarks>
        /// An exception is thrown when a separator is missing or when a third separator
        /// is present. Fields found before the failure have already been assigned.
        /// </remarks>
        public override void ParseLine(DocData docData, string line)
        {
            // Title: from the start of the line up to the first separator.
            int titleEnd = line.IndexOf(WriteLineDocTask.SEP, 0);
            if (titleEnd < 0)
            {
                throw RuntimeException.Create("line: [" + line + "] is in an invalid format (missing: separator title::date)!");
            }
            docData.Title = line.Substring(0, titleEnd);

            // Date: between the first and second separators.
            // The "+ 1" steps over one character - presumably SEP is a single char; verify.
            int dateStart = titleEnd + 1;
            int dateEnd = line.IndexOf(WriteLineDocTask.SEP, dateStart);
            if (dateEnd < 0)
            {
                throw RuntimeException.Create("line: [" + line + "] is in an invalid format (missing: separator date::body)!");
            }
            docData.SetDate(line.Substring(dateStart, dateEnd - dateStart));

            // Body: the remainder, which must not contain another separator.
            int bodyStart = dateEnd + 1;
            if (line.IndexOf(WriteLineDocTask.SEP, bodyStart) >= 0)
            {
                throw RuntimeException.Create("line: [" + line + "] is in an invalid format (too many separators)!");
            }
            docData.Body = line.Substring(bodyStart);
        }
Esempio n. 3
0
        /// <summary>
        /// Stores <paramref name="text"/> in the <paramref name="docData"/> member selected by
        /// <c>posToF[position]</c>. For <c>FieldName.PROP</c> the value goes into the
        /// property map under the key <c>m_header[position]</c>.
        /// </summary>
        /// <param name="docData">Document being filled.</param>
        /// <param name="position">Index into <c>posToF</c> (and <c>m_header</c> for properties).</param>
        /// <param name="text">Field value to store.</param>
        private void SetDocDataField(DocData docData, int position, string text)
        {
            switch (posToF[position])
            {
                case FieldName.NAME:
                    docData.Name = text;
                    break;

                case FieldName.TITLE:
                    docData.Title = text;
                    break;

                case FieldName.DATE:
                    docData.SetDate(text);
                    break;

                case FieldName.BODY:
                    docData.Body = text;
                    break;

                case FieldName.PROP:
                    // Create the property map on first use, then add/overwrite the entry.
                    var properties = docData.Props;
                    if (properties == null)
                    {
                        properties = new Dictionary<string, string>();
                        docData.Props = properties;
                    }
                    properties[m_header[position]] = text;
                    break;
            }
        }
Esempio n. 4
0
        /// <summary>
        /// Parses <paramref name="source"/> with a <c>Parser</c> and copies its body, title
        /// and meta tags into <paramref name="docData"/>.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <param name="name">Value stored as the document name.</param>
        /// <param name="date">Date to use unless a parsable "date" meta tag is found.</param>
        /// <param name="source">Input handed to the <c>Parser</c>.</param>
        /// <param name="trecSrc">Source whose <c>ParseDate</c> converts the meta tag text.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
        {
            Parser parser = new Parser(source);
            IDictionary<string, string> metaTags = parser.MetaTags;

            // A "date" meta tag overrides the supplied date, but only when it parses.
            string metaDate;
            bool hasMetaDate = metaTags.TryGetValue("date", out metaDate) && metaDate != null;
            if (hasMetaDate)
            {
                DateTime? parsed = trecSrc.ParseDate(metaDate);
                if (parsed.HasValue)
                {
                    date = parsed;
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = parser.Body;
            docData.Title = parser.Title;
            docData.Props = metaTags;
            docData.SetDate(date);
            return docData;
        }
Esempio n. 5
0
        /// <summary>
        /// Populates <paramref name="docData"/> from the raw document text in
        /// <paramref name="docBuf"/>. When a HEADER marker is present, the date and title
        /// are extracted from it and, if HEADER_END is also found, the body starts right
        /// after HEADER_END.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <param name="name">Value stored as the document name.</param>
        /// <param name="trecSrc">Source whose <c>ParseDate</c> converts the extracted date text.</param>
        /// <param name="docBuf">Raw text of one document.</param>
        /// <param name="pathType">Not used by this parser.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int mark = 0; // number of leading characters skipped when building the body
            DateTime? date = null;
            string title = null;

            // Markers are literal tags, so compare ordinally rather than by culture.
            int h1 = docBuf.IndexOf(HEADER, StringComparison.Ordinal);
            if (h1 >= 0)
            {
                int h2 = docBuf.IndexOf(HEADER_END, h1, StringComparison.Ordinal);

                // Only skip the header when its end marker exists; with h2 == -1 the
                // sum below would point at an arbitrary offset near the start of the text.
                if (h2 >= 0)
                {
                    mark = h2 + HEADER_END_LENGTH;
                }

                // date...
                string dateStr = Extract(docBuf, DATE1, DATE1_END, h2, null);
                if (dateStr != null)
                {
                    date = trecSrc.ParseDate(dateStr);
                }

                // title...
                title = Extract(docBuf, TI, TI_END, h2, null);
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(date);
            docData.Title = title;
            docData.Body = StripTags(docBuf, mark).ToString();
            return docData;
        }
        // TODO: we could take param to specify locale...
        //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
        //                                                                     RuleBasedNumberFormat.SPELLOUT);
        /// <summary>
        /// Produces a synthetic document from a running counter: the body is the counter
        /// spelled out via <c>ToWords()</c>, the name is <c>doc_N</c> and the title is
        /// <c>title_N</c>. The counter wraps from <c>long.MaxValue</c> to <c>long.MinValue</c>.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData GetNextDocData(DocData docData)
        {
            // The whole method runs under one lock on this instance, so the counter can be
            // read and advanced directly; a second nested lock on the same object adds nothing.
            lock (this)
            {
                docData.Clear();

                long curCounter = counter;
                if (counter == long.MaxValue)
                {
                    counter = long.MinValue; // loop around
                }
                else
                {
                    ++counter;
                }

                docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
                docData.Name = "doc_" + curCounter.ToString(CultureInfo.InvariantCulture);
                docData.Title = "title_" + curCounter.ToString(CultureInfo.InvariantCulture);
                // NOTE(review): new DateTime() is the default value (0001-01-01 00:00:00),
                // not the current time - confirm that this is the intended date.
                docData.SetDate(new DateTime());
                return docData;
            }
        }
Esempio n. 7
0
 /// <summary>
 /// Reads the next tuple from <c>parser</c> and copies its ID, BODY, DATE and TITLE
 /// entries into <paramref name="docData"/>.
 /// </summary>
 /// <param name="docData">Reusable instance that is cleared and refilled.</param>
 /// <returns>The same <paramref name="docData"/> instance.</returns>
 public override DocData GetNextDocData(DocData docData)
 {
     // Fetch first, so docData is left untouched if the parser throws.
     string[] fields = parser.Next();

     docData.Clear();
     docData.Name = fields[ID];
     docData.Body = fields[BODY];
     docData.SetDate(fields[DATE]);
     docData.Title = fields[TITLE];
     return docData;
 }
Esempio n. 8
0
        /// <summary>
        /// Loads the next file from <c>inputFiles</c> into <paramref name="docData"/>.
        /// The file layout is: line 1 = date, line 3 = title, line 5 onwards = body
        /// (lines 2 and 4 are skipped). Body lines are joined with single spaces.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        /// <exception cref="NoMoreDataException">
        /// All files have been consumed and <c>m_forever</c> is false.
        /// </exception>
        public override DocData GetNextDocData(DocData docData)
        {
            FileInfo f = null;
            string name = null;

            // Pick the next file (and wrap around if allowed) under the monitor.
            UninterruptableMonitor.Enter(this);
            try
            {
                if (nextFile >= inputFiles.Count)
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    nextFile = 0;
                    iteration++;
                }
                f = inputFiles[nextFile++];
                name = f.GetCanonicalPath() + "_" + iteration;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }

            string dateStr;
            string title;
            StringBuilder bodyBuf = new StringBuilder(1024);

            // A scoped using block closes the file exactly once, before the size is
            // reported, without any additional manual Dispose call.
            using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                // First line is the date, 3rd is the title, rest is body
                dateStr = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                title = reader.ReadLine();
                reader.ReadLine(); // skip an empty line

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    bodyBuf.Append(line).Append(' ');
                }
            }

            AddBytes(f.Length);

            // ReadLine returns null for an empty file; leave the date unset in that case
            // instead of failing on dateStr.Trim().
            DateTime? date = null;
            if (dateStr != null)
            {
                date = ParseDate(dateStr.Trim());
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = bodyBuf.ToString();
            docData.Title = title;
            docData.SetDate(date);
            return docData;
        }
Esempio n. 9
0
        /// <summary>
        /// Loads the next file produced by the <c>inputFiles</c> enumerator into
        /// <paramref name="docData"/>. The file layout is: line 1 = date, line 3 = title,
        /// line 5 onwards = body (lines 2 and 4 are skipped). Body lines are joined with
        /// single spaces.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        /// <exception cref="NoMoreDataException">
        /// The enumerator is exhausted and <c>m_forever</c> is false, or a freshly
        /// created enumerator yields no file at all.
        /// </exception>
        public override DocData GetNextDocData(DocData docData)
        {
            FileInfo f = null;
            string name = null;

            lock (this)
            {
                if (!inputFiles.MoveNext())
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    inputFiles = new Enumerator(dataDir);
                    iteration++;

                    // The replacement enumerator must be advanced before Current is read,
                    // just like the one it replaces; without this the first document of
                    // every new round would have no valid current file.
                    if (!inputFiles.MoveNext())
                    {
                        throw new NoMoreDataException();
                    }
                }
                f = inputFiles.Current;
                name = f.GetCanonicalPath() + "_" + iteration;
            }

            string line = null;
            string dateStr;
            string title;
            StringBuilder bodyBuf = new StringBuilder(1024);

            using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                // First line is the date, 3rd is the title, rest is body
                dateStr = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                title = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                while ((line = reader.ReadLine()) != null)
                {
                    bodyBuf.Append(line).Append(' ');
                }
            }
            AddBytes(f.Length);

            DateTime? date = ParseDate(dateStr);

            docData.Clear();
            docData.Name = name;
            docData.Body = bodyBuf.ToString();
            docData.Title = title;
            docData.SetDate(date);
            return docData;
        }
Esempio n. 10
0
            /// <summary>
            /// Returns a single fixed document (body "body", date "date", title "title",
            /// one property key/value) on the first call and throws on every later call.
            /// </summary>
            /// <param name="docData">Instance that receives the fixed values.</param>
            /// <returns>The same <paramref name="docData"/> instance.</returns>
            /// <exception cref="NoMoreDataException">The single document was already returned.</exception>
            public override DocData GetNextDocData(DocData docData)
            {
                if (finish)
                {
                    throw new NoMoreDataException();
                }

                docData.Body = "body";
                docData.SetDate("date");
                docData.Title = "title";
                docData.Props = new Dictionary<string, string>
                {
                    ["key"] = "value"
                };

                // Mark the source as exhausted so the next call throws.
                finish = true;
                return docData;
            }
Esempio n. 11
0
        /// <summary>
        /// Populates <paramref name="docData"/> from the raw document text in
        /// <paramref name="docBuf"/>: the date comes from the DATE/DATE_END markers, the
        /// title from SUBJECT (falling back to HEADLINE), and the body is the whole
        /// buffer with tags stripped.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <param name="name">Value stored as the document name.</param>
        /// <param name="trecSrc">Source whose <c>ParseDate</c> converts the extracted date text.</param>
        /// <param name="docBuf">Raw text of one document.</param>
        /// <param name="pathType">Not used by this parser.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date...
            DateTime? parsedDate = null;
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            if (rawDate != null)
            {
                int noisePos = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
                if (noisePos > 0)
                {
                    // Keep three characters past the noise position - the "day" part is needed.
                    rawDate = rawDate.Substring(0, noisePos + 3);
                }
                string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                parsedDate = trecSrc.ParseDate(cleanedDate);
            }

            // title... SUBJECT is preferred; HEADLINE is only consulted when SUBJECT is absent.
            string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
                           ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
            if (title != null)
            {
                title = StripTags(title, 0).ToString().Trim();
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = title;
            docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped
            return docData;
        }
        // TODO: we could take param to specify locale...
        //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
        //                                                                     RuleBasedNumberFormat.SPELLOUT);
        /// <summary>
        /// Produces a synthetic document from a running counter: the body is the counter
        /// spelled out via <c>ToWords()</c>, the name is <c>doc_N</c> and the title is
        /// <c>title_N</c>. The counter wraps from <c>long.MaxValue</c> to <c>long.MinValue</c>.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData GetNextDocData(DocData docData)
        {
            // The monitor on this instance is held for the whole method, so the counter is
            // read and advanced directly; entering the same monitor a second time is redundant.
            UninterruptableMonitor.Enter(this);
            try
            {
                docData.Clear();

                long curCounter = counter;
                if (counter == long.MaxValue)
                {
                    counter = long.MinValue; // loop around
                }
                else
                {
                    ++counter;
                }

                docData.Body = curCounter.ToWords(); //rnbf.format(curCounter);
                docData.Name = "doc_" + curCounter.ToString(CultureInfo.InvariantCulture);
                docData.Title = "title_" + curCounter.ToString(CultureInfo.InvariantCulture);
                // NOTE(review): new DateTime() is the default value (0001-01-01 00:00:00),
                // not the current time - confirm that this is the intended date.
                docData.SetDate(new DateTime());
                return docData;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
Esempio n. 13
0
        /// <summary>
        /// Populates <paramref name="docData"/> from the raw document text in
        /// <paramref name="docBuf"/>: the date comes from the DATE/DATE_END markers, the
        /// title from the HEADLINE/HEADLINE_END markers (stored as extracted), and the
        /// body is the whole buffer with tags stripped.
        /// </summary>
        /// <param name="docData">Reusable instance that is cleared and refilled.</param>
        /// <param name="name">Value stored as the document name.</param>
        /// <param name="trecSrc">Source whose <c>ParseDate</c> converts the extracted date text.</param>
        /// <param name="docBuf">Raw text of one document.</param>
        /// <param name="pathType">Not used by this parser.</param>
        /// <returns>The same <paramref name="docData"/> instance.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            // date...
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            DateTime? parsedDate = null;
            if (rawDate != null)
            {
                parsedDate = trecSrc.ParseDate(rawDate);
            }

            // title...
            string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            docData.Body = StripTags(docBuf, 0).ToString(); // nothing is skipped
            return docData;
        }