/// <summary>
/// Reads the whole file at <paramref name="file_path"/> and splits it into one
/// <see cref="Document"/> per <c>&lt;DOC&gt;</c> ... <c>&lt;/DOC&gt;</c> section.
/// </summary>
/// <param name="file_path">Path of the file to read; it is also passed to every created document.</param>
/// <returns>
/// The documents in the order they were found. The list is empty when no
/// section was found. Scanning stops at the first malformed section (closing
/// tag missing or located before the section content); documents collected
/// up to that point are still returned.
/// </returns>
/// <remarks>Exceptions thrown by <c>System.IO.File.ReadAllText</c> are not caught here.</remarks>
public List<Document> get_docs(string file_path)
{
    List<Document> my_batch = new List<Document>();
    int offset = 0;
    StringBuilder str = new StringBuilder(System.IO.File.ReadAllText(file_path));

    // A remaining buffer of 100 characters or fewer is not scanned any further.
    while (str.Length > 100)
    {
        int doc_idx_start2 = indexOf(str, "<DOC>");
        if (doc_idx_start2 == -1)
        {
            break;
        }

        // "<DOC>" is 5 characters long; the 6th skipped character is presumably
        // the line break that follows the tag -- TODO confirm against the corpus format.
        int content_start = doc_idx_start2 + 6;

        int doc_idx_end2 = indexOf(str, "</DOC>");
        if (doc_idx_end2 < content_start)
        {
            // Closing tag missing (-1) or positioned before the content start.
            // Extracting would request a negative length and throw
            // ArgumentOutOfRangeException, so stop and keep what was parsed.
            break;
        }

        string single_doc2 = str.ToString(content_start, doc_idx_end2 - content_start);
        my_batch.Add(new Document(single_doc2, offset, file_path));

        // Offset handed to the NEXT document: index of this closing tag in the
        // current buffer plus 2.
        // NOTE(review): earlier sections have already been removed from the buffer,
        // so this is not an absolute position in the file -- confirm this is intended.
        offset = doc_idx_end2 + 2;

        // Drop the consumed section, opening tag through closing tag ("</DOC>" is 6 characters).
        str.Remove(doc_idx_start2, doc_idx_end2 + 6 - doc_idx_start2);
    }

    return my_batch;
}
/// <summary>
/// Loads <paramref name="doc"/> into this parser's working fields, runs the
/// parsing stages over it in a fixed order and returns the term dictionary.
/// </summary>
/// <param name="doc">Document whose text, title, date, id, batch id and index are copied into the parser state.</param>
/// <returns>The <c>termDic</c> field, which is replaced with a new empty dictionary at the start of every call.</returns>
public Dictionary<string,TermInDoc> parse_doc (Document doc)
{
    // Start every document with a fresh term dictionary.
    termDic = new Dictionary<string, TermInDoc>();

    // Working copies of the document content.
    doc_text = new StringBuilder(doc.text);
    doc_title = new StringBuilder(doc.title);

    // Document metadata.
    doc_id = doc.id;
    batch_id = doc.batch_id;
    doc_offset = doc.doc_idx;
    doc_date = doc.date;

    // Date parsing receives a string snapshot of the text taken before any other stage runs.
    string text_snapshot = doc_text.ToString();
    dates_parse(text_snapshot);

    // The remaining stages take no arguments; presumably they work on the
    // fields set above and fill termDic -- verify against their definitions.
    // Their order is kept exactly as written.
    numbers_parse();
    replace_chars();
    names_parse();
    remove_stopwords_text();
    remove_stopwords_title();
    regular_words_parse_text();
    regular_words_parse_title();

    return termDic;
}
/// <summary>
/// Stores <paramref name="doc"/> at position <paramref name="i"/> of <c>my_batch</c>,
/// replacing whatever was there.
/// </summary>
/// <param name="i">Index into <c>my_batch</c>; no range check is performed here.</param>
/// <param name="doc">Document to store at that index.</param>
public void SetDoc(int i,Document doc)
{
    my_batch[i] = doc;
}
/// <summary>
/// Adds <paramref name="doc"/> to <c>my_batch</c>.
/// </summary>
/// <param name="doc">Document to add.</param>
public void AddDoc(Document doc)
{
    my_batch.Add(doc);
}
/// <summary>
/// Enqueues <paramref name="doc"/> on <c>docs_to_parse</c>.
/// </summary>
/// <param name="doc">Document to enqueue.</param>
public void insertDoc (Document doc)
{
    docs_to_parse.Enqueue(doc);
}
// Empty member: accepts a Document and does nothing with it.
// NOTE(review): no return type is declared, so this only compiles as a
// constructor of a type named parse_doc -- confirm which type it belongs to.
public parse_doc(Document doc) { }