/// <summary>
/// Builds the detail window for a single document: copies the headline,
/// dateline and id from the vector, re-reads the document text from the
/// corpus file, and binds the feature/weight pairs to the list view.
/// </summary>
/// <param name="Document">Labeled vector whose inner document supplies the id, headline, dateline, corpus location and weights.</param>
/// <param name="CorpusPath">Path handed to the <c>SgmlParser</c> used to re-read the document text.</param>
/// <param name="Features">Feature names; entry <c>i</c> is paired with <c>Document.Document.Vector[i]</c>.</param>
public DocumentWindow( LabeledDocumentVector Document, String CorpusPath, List<String> Features )
{
    InitializeComponent();

    Parser = new SgmlParser(CorpusPath);
    StringBuilder builder = new StringBuilder();
    try
    {
        // Start reading at the location recorded for this document
        // (presumably an offset into the corpus file - confirm against SgmlParser).
        Parser.FilePosition = Document.Document.Location;

        this.Features = Features;
        HeadLine = Document.Document.HeadLine;
        DateLine = Document.Document.DateLine;
        Id = Document.Document.Id;

        // Collect paragraphs until the parser reports a different document id
        // or runs out of input.
        String Value;
        while ((Value = Parser.NextParagraph()) != null)
        {
            if (Parser.DocID != Id)
            {
                break;
            }

            builder.Append(Value);
        }
    }
    finally
    {
        // Always release the parser, even if reading the corpus fails part-way.
        Parser.Close();
    }

    DocumentContent = builder.ToString();

    // Pair each feature name with the weight at the same index in the vector.
    // Dictionary.Add throws on a repeated key, so feature names must be unique.
    FeatureWeights = new Dictionary<string, double>();
    for (int i = 0; i < Features.Count; i++)
    {
        FeatureWeights.Add(Features[i], Document.Document.Vector[i]);
    }

    VectorDataListView.ItemsSource = FeatureWeights;
    this.Title = Document.Document.Id + " Details";
}
/// <summary>
/// Parses the given corpus and places it in the inverted index.
/// </summary>
/// <remarks>
/// Rebuilds <c>InvertedIndex</c> from scratch and resets <c>invertedIndexWatch</c>,
/// which is only running while tokens are being added to the index.
/// Every token returned by the parser (including stop words and numbers)
/// counts toward its document's entry in <c>DocumentLengths</c>.
/// <c>ParseIteration</c> is raised on the first token of each new document id.
/// </remarks>
/// <param name="DocumentPath">Path handed to the <c>SgmlParser</c> that tokenizes the corpus.</param>
private void Parse( String DocumentPath )
{
    InvertedIndex = new SuffixNode(' ');
    invertedIndexWatch.Reset();

    Parser = new SgmlParser(DocumentPath);
    try
    {
        String value;
        String prevDocId = "";

        while ((value = Parser.Next()) != null)
        {
            if (!StopWordsIndex.HasWord(value) && !isNumber(value))
            {
                value = Stem(value);

                // Guard the value[0] access below: a token that stems to nothing
                // cannot be indexed, so skip it instead of throwing.
                if (!String.IsNullOrEmpty(value))
                {
                    // NOTE(review): capitalization is tested on the stemmed token;
                    // if Stem lower-cases its input the weight is always 1 - confirm.
                    int weight = isCapital(value[0]) ? 2 : 1;

                    // Time only the index insertion itself.
                    invertedIndexWatch.Start();
                    InvertedIndex.Add(value, new DocumentIndex(Parser.DocID, 0), weight);
                    invertedIndexWatch.Stop();
                }
            }

            // First token seen for this document: register it with a zero length.
            if (!Documents.Contains(Parser.DocID))
            {
                Documents[Parser.DocID] = new DocumentVector(Parser.DocID, Parser.HeadLine, Parser.DateLine, Parser.DocumentPosition);
                DocumentLengths[Parser.DocID] = 0;
            }

            DocumentLengths[Parser.DocID] = ((int)DocumentLengths[Parser.DocID]) + 1;

            // Fire an event out to any attached methods when the document changes.
            if (prevDocId != Parser.DocID && ParseIteration != null)
            {
                ParseIteration(this);
            }

            prevDocId = Parser.DocID;
        }
    }
    finally
    {
        // Always release the parser, even if tokenizing or indexing fails part-way.
        Parser.Close();
    }
}