예제 #1
0
        /// <summary>
        /// Populates <paramref name="docData"/> from the content read through
        /// <paramref name="source"/>.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <param name="name">Value assigned to <c>docData.Name</c>.</param>
        /// <param name="date">Fallback date, used unless a parseable "date" meta tag is present.</param>
        /// <param name="source">Input handed to the <c>Parser</c> constructor.</param>
        /// <param name="trecSrc">Supplies <c>ParseDate</c> for the "date" meta tag value.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public virtual DocData Parse(DocData docData, string name, DateTime? date, InputSource source, TrecContentSource trecSrc)
        {
            Parser htmlParser = new Parser(source);
            IDictionary<string, string> metaTags = htmlParser.MetaTags;

            // A "date" meta tag that parses successfully overrides the caller's date.
            string rawDate;
            bool hasDateTag = metaTags.TryGetValue("date", out rawDate);
            if (hasDateTag && rawDate != null)
            {
                DateTime? tagDate = trecSrc.ParseDate(rawDate);
                if (tagDate != null)
                {
                    date = tagDate;
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.Body = htmlParser.Body;
            docData.Title = htmlParser.Title;
            docData.Props = metaTags;
            docData.SetDate(date);
            return docData;
        }
예제 #2
0
        //TODO can we also extract title for this format?

        /// <summary>
        /// Fills <paramref name="docData"/> from the raw document buffer: the body is the
        /// tag-stripped buffer starting just past the TEXT marker (when present), and the
        /// date is extracted from the DATE..DATE_END span. No title is set.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <param name="name">Value assigned to <c>docData.Name</c>.</param>
        /// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0; // number of leading characters excluded from the body
            DateTime? parsedDate = null;

            int textStart = docBuf.IndexOf(TEXT, StringComparison.Ordinal);
            if (textStart >= 0)
            {
                int textEnd = docBuf.IndexOf(TEXT_END, textStart, StringComparison.Ordinal);
                bodyStart = textStart + TEXT_LENGTH;

                // The TEXT_END position is passed to Extract as its limit argument.
                string rawDate = Extract(docBuf, DATE, DATE_END, textEnd, DATE_NOISE_PREFIXES);
                if (rawDate != null)
                {
                    string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                    parsedDate = trecSrc.ParseDate(cleanedDate);
                }
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
예제 #3
0
        /// <summary>
        /// Fills <paramref name="docData"/> from the raw document buffer. When a HEADER
        /// marker is found, the body starts after HEADER_END and the date and title are
        /// extracted from the DATE1 and TI spans; otherwise the whole buffer is the body.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <param name="name">Value assigned to <c>docData.Name</c>.</param>
        /// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0; // number of leading characters excluded from the body
            DateTime? parsedDate = null;
            string headline = null;

            int headerStart = docBuf.IndexOf(HEADER);
            if (headerStart >= 0)
            {
                int headerEnd = docBuf.IndexOf(HEADER_END, headerStart);
                bodyStart = headerEnd + HEADER_END_LENGTH;

                // The HEADER_END position is passed to Extract as its limit argument.
                string rawDate = Extract(docBuf, DATE1, DATE1_END, headerEnd, null);
                if (rawDate != null)
                {
                    parsedDate = trecSrc.ParseDate(rawDate);
                }

                headline = Extract(docBuf, TI, TI_END, headerEnd, null);
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
        // TODO: we could take param to specify locale...
        //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
        //                                                                     RuleBasedNumberFormat.SPELLOUT);
        /// <summary>
        /// Produces the next synthetic document from the running counter: the body is the
        /// counter spelled out via <c>ToWords()</c>, and the name and title are
        /// "doc_" / "title_" followed by the invariant-culture counter value.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData GetNextDocData(DocData docData)
        {
            // The whole method runs under this single lock. The former nested
            // lock (this) around the counter update re-acquired the same monitor
            // on the same thread and added no protection, so it was removed.
            lock (this)
            {
                docData.Clear();

                long curCounter = counter;
                if (counter == long.MaxValue)
                {
                    counter = long.MinValue; // loop around
                }
                else
                {
                    ++counter;
                }

                string counterText = curCounter.ToString(CultureInfo.InvariantCulture);
                docData.Body  = curCounter.ToWords(); //rnbf.format(curCounter);
                docData.Name  = "doc_" + counterText;
                docData.Title = "title_" + counterText;
                // NOTE(review): new DateTime() is DateTime.MinValue (0001-01-01), not the
                // current time. Confirm this is intended; if "now" was meant, this needs
                // DateTime.UtcNow / DateTime.Now instead.
                docData.SetDate(new DateTime());
                return docData;
            }
        }
예제 #5
0
 /// <summary>
 /// Pulls the next tuple from <c>parser</c> and copies its ID, BODY, DATE and TITLE
 /// slots into <paramref name="docData"/>.
 /// </summary>
 /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
 /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
 public override DocData GetNextDocData(DocData docData)
 {
     // Fetch the tuple first so docData is only cleared once a tuple was obtained.
     string[] fields = parser.Next();

     docData.Clear();
     docData.Name = fields[ID];
     docData.Body = fields[BODY];
     docData.SetDate(fields[DATE]);
     docData.Title = fields[TITLE];
     return docData;
 }
예제 #6
0
        /// <summary>
        /// Returns a document whose body is always <c>DOC_TEXT</c> and whose name is
        /// "doc" followed by a freshly allocated document id.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData GetNextDocData(DocData docData)
        {
            int docNumber = NewDocID();

            // Report the size of the fixed text to the byte counter.
            AddBytes(DOC_TEXT.Length);

            docData.Clear();
            docData.Name = "doc" + docNumber;
            docData.Body = DOC_TEXT;
            return docData;
        }
예제 #7
0
        /// <summary>
        /// Reads the next input file and fills <paramref name="docData"/> from it.
        /// The file layout is: line 1 = date, line 3 = title, line 5 onward = body
        /// (lines 2 and 4 are skipped). Body lines are joined with a trailing space each.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        /// <exception cref="NoMoreDataException">
        /// All files have been consumed and <c>m_forever</c> is false.
        /// </exception>
        public override DocData GetNextDocData(DocData docData)
        {
            FileInfo f    = null;
            string   name = null;

            // Only the file selection is done under the monitor; reading happens outside it.
            UninterruptableMonitor.Enter(this);
            try
            {
                if (nextFile >= inputFiles.Count)
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    nextFile = 0;
                    iteration++;
                }
                f    = inputFiles[nextFile++];
                name = f.GetCanonicalPath() + "_" + iteration;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }

            string dateStr;
            string title;
            StringBuilder bodyBuf = new StringBuilder(1024);

            // A scoped using block closes the file as soon as reading finishes and
            // disposes the reader exactly once (previously a using declaration was
            // combined with an explicit Dispose() call, disposing it twice).
            using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                // First line is the date, 3rd is the title, rest is body
                dateStr = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                title = reader.ReadLine();
                reader.ReadLine(); // skip an empty line

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    bodyBuf.Append(line).Append(' ');
                }
            }

            AddBytes(f.Length);

            // An empty file yields no date line; leave the date unset instead of
            // failing with a NullReferenceException on Trim().
            DateTime? date = null;
            if (dateStr != null)
            {
                date = ParseDate(dateStr.Trim());
            }

            docData.Clear();
            docData.Name  = name;
            docData.Body  = bodyBuf.ToString();
            docData.Title = title;
            docData.SetDate(date);
            return docData;
        }
예제 #8
0
        /// <summary>
        /// Reads the next file produced by the <c>inputFiles</c> enumerator and fills
        /// <paramref name="docData"/> from it. The file layout is: line 1 = date,
        /// line 3 = title, line 5 onward = body (lines 2 and 4 are skipped).
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        /// <exception cref="NoMoreDataException">
        /// The enumerator is exhausted and <c>m_forever</c> is false, or a restarted
        /// traversal yields no file at all.
        /// </exception>
        public override DocData GetNextDocData(DocData docData)
        {
            FileInfo f    = null;
            string   name = null;

            lock (this)
            {
                if (!inputFiles.MoveNext())
                {
                    // exhausted files, start a new round, unless forever set to false.
                    if (!m_forever)
                    {
                        throw new NoMoreDataException();
                    }
                    inputFiles = new Enumerator(dataDir);
                    iteration++;

                    // A fresh enumerator is positioned before its first element, so it
                    // must be advanced before Current is read. Previously Current was
                    // read directly after the restart, leaving f unpositioned.
                    if (!inputFiles.MoveNext())
                    {
                        // Nothing to read even after restarting the traversal.
                        throw new NoMoreDataException();
                    }
                }
                f = inputFiles.Current;
                name = f.GetCanonicalPath() + "_" + iteration;
            }

            string        line = null;
            string        dateStr;
            string        title;
            StringBuilder bodyBuf = new StringBuilder(1024);

            using (TextReader reader = new StreamReader(new FileStream(f.FullName, FileMode.Open, FileAccess.Read), Encoding.UTF8))
            {
                // First line is the date, 3rd is the title, rest is body
                dateStr = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                title = reader.ReadLine();
                reader.ReadLine(); // skip an empty line
                while ((line = reader.ReadLine()) != null)
                {
                    bodyBuf.Append(line).Append(' ');
                }
            }
            AddBytes(f.Length);

            DateTime? date = ParseDate(dateStr);

            docData.Clear();
            docData.Name  = name;
            docData.Body  = bodyBuf.ToString();
            docData.Title = title;
            docData.SetDate(date);
            return docData;
        }
예제 #9
0
        /// <summary>
        /// Reads the next line from <c>reader</c> and lets <c>docDataLineReader</c> parse
        /// it into <paramref name="docData"/>. Reading the line and allocating its id are
        /// done under the monitor; parsing happens after the monitor is released.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        /// <exception cref="NoMoreDataException">
        /// The input is exhausted and <c>m_forever</c> is false.
        /// </exception>
        public override DocData GetNextDocData(DocData docData)
        {
            string currentLine;
            int    docId;

            UninterruptableMonitor.Enter(this);
            try
            {
                currentLine = reader.ReadLine();
                if (currentLine is null)
                {
                    if (m_forever)
                    {
                        // Input exhausted: reopen the file and retry.
                        OpenFile();
                        return GetNextDocData(docData);
                    }
                    throw new NoMoreDataException();
                }

                if (docDataLineReader is null)
                {
                    // Very first line read: build the line reader from it once.
                    docDataLineReader = CreateDocDataLineReader(currentLine);
                    if (skipHeaderLine)
                    {
                        // This line is not a document; move on to the next one.
                        return GetNextDocData(docData);
                    }
                }

                // The id is taken only here, after the early returns above.
                docId = readCount++;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }

            docData.Clear();
            docData.ID = docId;
            docDataLineReader.ParseLine(docData, currentLine);
            return docData;
        }
예제 #10
0
        /// <summary>
        /// Fills <paramref name="docData"/> from the raw document buffer: the date comes
        /// from the DATE span (cut shortly after the DATE_NOISE marker when one is found),
        /// the title from the SUBJECT span or, failing that, the HEADLINE span, and the
        /// body is the whole buffer with tags stripped.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <param name="name">Value assigned to <c>docData.Name</c>.</param>
        /// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0; // nothing is skipped: the body covers the whole buffer

            DateTime? parsedDate = null;
            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            if (rawDate != null)
            {
                int noiseAt = rawDate.IndexOf(DATE_NOISE, StringComparison.Ordinal);
                if (noiseAt > 0)
                {
                    // Keep three characters from the marker position onward
                    // (the "day" part is still needed) and drop the rest.
                    rawDate = rawDate.Substring(0, noiseAt + 3);
                }
                string cleanedDate = StripTags(rawDate, 0).ToString().Trim();
                parsedDate = trecSrc.ParseDate(cleanedDate);
            }

            // Title: try SUBJECT first, then fall back to HEADLINE.
            string title = Extract(docBuf, SUBJECT, SUBJECT_END, -1, null)
                           ?? Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);
            if (title != null)
            {
                title = StripTags(title, 0).ToString().Trim();
            }

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = title;
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }
예제 #11
0
        // TODO: we could take param to specify locale...
        //private readonly RuleBasedNumberFormat rnbf = new RuleBasedNumberFormat(Locale.ROOT,
        //                                                                     RuleBasedNumberFormat.SPELLOUT);
        /// <summary>
        /// Produces the next synthetic document from the running counter: the body is the
        /// counter spelled out via <c>ToWords()</c>, and the name and title are
        /// "doc_" / "title_" followed by the invariant-culture counter value.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData GetNextDocData(DocData docData)
        {
            // The whole method runs under this single monitor acquisition. The former
            // nested Enter/Exit pair on the same object around the counter update
            // added no protection and was removed.
            UninterruptableMonitor.Enter(this);
            try
            {
                docData.Clear();

                long curCounter = counter;
                if (counter == long.MaxValue)
                {
                    counter = long.MinValue; // loop around
                }
                else
                {
                    ++counter;
                }

                string counterText = curCounter.ToString(CultureInfo.InvariantCulture);
                docData.Body  = curCounter.ToWords(); //rnbf.format(curCounter);
                docData.Name  = "doc_" + counterText;
                docData.Title = "title_" + counterText;
                // NOTE(review): new DateTime() is DateTime.MinValue (0001-01-01), not the
                // current time. Confirm this is intended; if "now" was meant, this needs
                // DateTime.UtcNow / DateTime.Now instead.
                docData.SetDate(new DateTime());
                return docData;
            }
            finally
            {
                UninterruptableMonitor.Exit(this);
            }
        }
예제 #12
0
        /// <summary>
        /// Fills <paramref name="docData"/> from the raw document buffer: the date comes
        /// from the DATE span, the title from the HEADLINE span, and the body is the
        /// whole buffer with tags stripped.
        /// </summary>
        /// <param name="docData">Reusable document holder; cleared and then refilled.</param>
        /// <param name="name">Value assigned to <c>docData.Name</c>.</param>
        /// <param name="trecSrc">Supplies <c>ParseDate</c> for the extracted date text.</param>
        /// <param name="docBuf">Raw document text.</param>
        /// <param name="pathType">Not used by this implementation.</param>
        /// <returns>The same <paramref name="docData"/> instance, filled in.</returns>
        public override DocData Parse(DocData docData, string name, TrecContentSource trecSrc,
                                      StringBuilder docBuf, ParsePathType pathType)
        {
            int bodyStart = 0; // nothing is skipped: the body covers the whole buffer

            string rawDate = Extract(docBuf, DATE, DATE_END, -1, null);
            DateTime? parsedDate = null;
            if (rawDate != null)
            {
                parsedDate = trecSrc.ParseDate(rawDate);
            }

            string headline = Extract(docBuf, HEADLINE, HEADLINE_END, -1, null);

            docData.Clear();
            docData.Name = name;
            docData.SetDate(parsedDate);
            docData.Title = headline;
            docData.Body = StripTags(docBuf, bodyStart).ToString();
            return docData;
        }