/// <summary>
/// Reads the whole file at <paramref name="file_path"/> and creates one
/// <see cref="Document"/> for every <c>&lt;DOC&gt;</c> ... <c>&lt;/DOC&gt;</c> section found in it.
/// </summary>
/// <param name="file_path">Path of the file to read; also passed to every created document.</param>
/// <returns>
/// The documents in the order they appear in the file. The list is empty when no
/// section is found, and contains only the sections before the first malformed one
/// when a closing tag is missing or misplaced.
/// </returns>
public List<Document> get_docs(string file_path)
        {
            // "<DOC>" is 5 characters; the extraction skips 6, i.e. one extra character
            // after the opening tag. NOTE(review): presumably a line break - confirm.
            const int openTagSkip = 6;
            // "</DOC>" is 6 characters.
            const int closeTagLength = 6;

            List<Document> my_batch = new List<Document>();
            StringBuilder remaining = new StringBuilder(System.IO.File.ReadAllText(file_path));

            // Value handed to the next Document. It is derived from the closing-tag index
            // inside the *shrinking* buffer, exactly as before; it is not an absolute file position.
            int offset = 0;

            // Buffers of 100 characters or fewer are not scanned for further sections.
            while (remaining.Length > 100)
            {
                int docStart = indexOf(remaining, "<DOC>");
                if (docStart == -1)
                {
                    break;
                }

                int docEnd = indexOf(remaining, "</DOC>");
                int bodyStart = docStart + openTagSkip;

                // Covers a missing closing tag (assuming indexOf reports -1 as it does for
                // the opening tag), a closing tag that precedes the opening tag, and a body
                // shorter than the skipped characters. Any of these used to reach
                // StringBuilder.ToString with a negative length and throw.
                if (docEnd < bodyStart)
                {
                    break;
                }

                string body = remaining.ToString(bodyStart, docEnd - bodyStart);
                my_batch.Add(new Document(body, offset, file_path));
                offset = docEnd + 2;

                // Drop the consumed section (tags included) so the next search finds the following one.
                remaining.Remove(docStart, docEnd + closeTagLength - docStart);
            }

            return my_batch;
        }
Exemple #2
0
        /// <summary>
        /// Loads the given document into this instance's working fields, runs the
        /// parsing steps over it in a fixed order, and returns the term dictionary.
        /// </summary>
        /// <param name="doc">Document whose text, title, date, id, batch id and index are copied into this instance.</param>
        /// <returns>The dictionary instance assigned to <c>termDic</c> at the start of this call.</returns>
        public Dictionary<string,TermInDoc> parse_doc (Document doc) 
        {

            // A new dictionary is created on every call, so the returned object is never
            // the one returned by a previous call.
            termDic = new Dictionary<string, TermInDoc>();
            // Text and title are copied into StringBuilders held in fields.
            doc_text = new StringBuilder(doc.text);
            doc_title = new StringBuilder(doc.title);
            doc_date = doc.date;
            doc_id = doc.id;
            batch_id = doc.batch_id;
            doc_offset = doc.doc_idx;
            // dates_parse is the only step that receives an argument: a string copy of the text.
            dates_parse(doc_text.ToString());
            // The remaining steps take no arguments; presumably they work on the fields set
            // above and fill termDic - the helpers are not visible here, TODO confirm.
            // NOTE(review): the order of these calls looks significant - confirm before reordering.
            numbers_parse();
            replace_chars();
            names_parse();
            remove_stopwords_text();
            remove_stopwords_title();
            regular_words_parse_text();
            regular_words_parse_title();
      
            return termDic;

        }
Exemple #3
0
 /// <summary>
 /// Assigns <paramref name="doc"/> to position <paramref name="i"/> of <c>my_batch</c>.
 /// </summary>
 /// <param name="i">Index within <c>my_batch</c> to write to.</param>
 /// <param name="doc">Document to store at that index.</param>
 public void SetDoc(int i, Document doc) => my_batch[i] = doc;
Exemple #4
0
 /// <summary>
 /// Adds <paramref name="doc"/> to <c>my_batch</c>.
 /// </summary>
 /// <param name="doc">Document to add.</param>
 public void AddDoc(Document doc) => my_batch.Add(doc);
 /// <summary>
 /// Enqueues <paramref name="doc"/> on <c>docs_to_parse</c>.
 /// </summary>
 /// <param name="doc">Document to enqueue.</param>
 public void insertDoc(Document doc) => docs_to_parse.Enqueue(doc);
Exemple #6
0
 // Declared without a return type, so syntactically this is a constructor. Its body is
 // empty and the doc argument is not used here.
 // NOTE(review): this only compiles if the enclosing type is itself named parse_doc;
 // the enclosing type is not visible in this excerpt - confirm.
 public parse_doc(Document doc)
 {
 }