/// <summary>
/// Creates the detail window for one document: copies its headline, dateline
/// and id, reads its text from the corpus file, and binds a
/// feature-name-to-weight table to <c>VectorDataListView</c>.
/// </summary>
/// <param name="Document">The labeled document whose details are displayed.</param>
/// <param name="CorpusPath">Path handed to <c>SgmlParser</c> to open the corpus.</param>
/// <param name="Features">
/// Feature names; entry <c>i</c> is paired with <c>Document.Document.Vector[i]</c>.
/// </param>
public DocumentWindow(LabeledDocumentVector Document, String CorpusPath, List<String> Features)
{
    InitializeComponent();

    Parser = new SgmlParser(CorpusPath);
    StringBuilder builder = new StringBuilder();
    try
    {
        // Start reading at the location recorded for this document.
        Parser.FilePosition = Document.Document.Location;
        this.Features = Features;
        HeadLine = Document.Document.HeadLine;
        DateLine = Document.Document.DateLine;
        Id = Document.Document.Id;

        String Value;
        while ((Value = Parser.NextParagraph()) != null)
        {
            // Stop as soon as the parser reports a different document id.
            if (Parser.DocID != Id)
            {
                break;
            }

            builder.Append(Value);
        }
    }
    finally
    {
        // Close the parser even when reading fails, so the corpus handle is
        // not left open after an exception.
        Parser.Close();
    }

    DocumentContent = builder.ToString();

    // Pair every feature name with the vector component at the same index.
    FeatureWeights = new Dictionary<string, double>();
    for (int i = 0; i < Features.Count; i++)
    {
        FeatureWeights.Add(Features[i], Document.Document.Vector[i]);
    }

    VectorDataListView.ItemsSource = FeatureWeights;
    this.Title = Document.Document.Id + " Details";
}
/// <summary>
/// Loads the training documents from a directory in the expected training
/// format (see documentation). Every file in the directory is read line by
/// line; each line is used as a key into <c>DocClassifier.Documents</c>, and
/// the file name without its extension becomes the label of the resulting
/// <c>LabeledDocumentVector</c>.
/// </summary>
/// <param name="TrainingPath">Directory containing the training files.</param>
/// <returns>
/// The labeled documents from all files, or <c>null</c> when
/// <paramref name="TrainingPath"/> does not exist.
/// </returns>
private List<LabeledDocumentVector> LoadTrainingData(String TrainingPath)
{
    if (!Directory.Exists(TrainingPath))
    {
        return null;
    }

    List<LabeledDocumentVector> TrainingData = new List<LabeledDocumentVector>();
    DirectoryInfo info = new DirectoryInfo(TrainingPath);

    foreach (FileInfo file in info.GetFiles())
    {
        // Strip the extension whatever its length; the earlier fixed
        // "Length - 4" cut threw on names shorter than four characters and
        // mislabeled files whose extension was not exactly three characters.
        String label = Path.GetFileNameWithoutExtension(file.Name);

        // The using statement releases the file handle even if a lookup or
        // constructor call below throws.
        using (StreamReader reader = new StreamReader(file.FullName))
        {
            String id;
            while ((id = reader.ReadLine()) != null)
            {
                LabeledDocumentVector Document = new LabeledDocumentVector(
                    (DocumentVector)DocClassifier.Documents[id],
                    label);
                TrainingData.Add(Document);
            }
        }
    }

    return TrainingData;
}