// Parses the corpus at DocumentPath and rebuilds the inverted index from it.
//
// For every token returned by the SGML parser:
//   * tokens that are neither stop words nor numbers are stemmed and added to
//     InvertedIndex for the parser's current document, with weight 2 when
//     isCapital() accepts the first character of the stemmed token, else 1;
//   * the current document is registered in Documents (and given a zero
//     length) the first time its id is seen;
//   * the current document's length is incremented -- this counts every
//     token, including stop words and numbers;
//   * ParseIteration is raised whenever the document id differs from the one
//     seen on the previous token, provided a handler is attached.
//
// invertedIndexWatch only times the InvertedIndex.Add calls.
private void Parse(String DocumentPath)
{
    InvertedIndex = new SuffixNode(' ');
    invertedIndexWatch.Reset();

    Parser = new SgmlParser(DocumentPath);
    try
    {
        String value;
        String prevDocId = "";

        while ((value = Parser.Next()) != null)
        {
            if (!StopWordsIndex.HasWord(value) && !isNumber(value))
            {
                value = Stem(value);

                // NOTE(review): the capitalisation test runs on the *stemmed*
                // token; if Stem() lower-cases or can return an empty string
                // this weight is always 1 / value[0] throws -- confirm.
                int weight = isCapital(value[0]) ? 2 : 1;

                invertedIndexWatch.Start();
                InvertedIndex.Add(value, new DocumentIndex(Parser.DocID, 0), weight);
                invertedIndexWatch.Stop();
            }

            // First token seen for this document: create its vector and length slot.
            if (!Documents.Contains(Parser.DocID))
            {
                Documents[Parser.DocID] = new DocumentVector(
                    Parser.DocID,
                    Parser.HeadLine,
                    Parser.DateLine,
                    Parser.DocumentPosition);
                DocumentLengths[Parser.DocID] = 0;
            }

            DocumentLengths[Parser.DocID] = ((int)DocumentLengths[Parser.DocID]) + 1;

            // Fire an event out to any attached methods when a new document starts.
            if (prevDocId != Parser.DocID && ParseIteration != null)
            {
                ParseIteration(this);
            }

            prevDocId = Parser.DocID;
        }
    }
    finally
    {
        // Always release the parser, even if indexing or a handler throws.
        Parser.Close();
    }
}
// Performs feature weighting by walking the inverted index depth-first.
//
// Node is the current trie node; word is the string spelled by the path from
// the root to Node. When Node carries a leaf, every (document id, count) pair
// in its posting list gets a TF-IDF weight written into that document's vector
// at the position of word in Features. The method then recurses into every
// non-null neighbour, extending word with that neighbour's Value.
private void TraverseInvertedIndex(SuffixNode Node, String word)
{
    if (Node.LeafNode != null)
    {
        // These values are the same for every posting of this word, so look
        // them up once instead of once per document (IndexOf is a search).
        int featureIndex = Features.IndexOf(word);
        int documentFrequency = Node.LeafNode.DocumentNodeList.Count;
        int corpusSize = Documents.Count;

        foreach (KeyValuePair<string, int> posting in Node.LeafNode.DocumentNodeList)
        {
            DocumentVector document = (DocumentVector)Documents[posting.Key];

            // tfidf weighting
            document.Vector[featureIndex] = TFIDF(
                posting.Value,
                documentFrequency,
                (int)DocumentLengths[posting.Key],
                corpusSize);
        }
    }

    for (int i = 0; i < Node.Neighbours.Length; i++)
    {
        if (Node.Neighbours[i] != null)
        {
            TraverseInvertedIndex(Node.Neighbours[i], word + Node.Neighbours[i].Value);
        }
    }
}
// Builds the stop-word index from the entries in stop_words.txt.
//
// The file is read as UTF-8 and split on '\n'. Within a line only characters
// accepted by SuffixNode.AcceptedCharacter are kept; all other characters are
// dropped (they do not act as separators). Each non-empty result is added to
// StopWordsIndex, including a final entry that has no trailing newline.
private void BuildStopWordsIndex()
{
    StopWordsIndex = new SuffixNode(' ');

    String list;

    // A StreamReader decodes the file as one stream, so a multi-byte UTF-8
    // character can never be split across two read buffers, and the using
    // block closes the file even if reading fails.
    using (StreamReader reader = new StreamReader("stop_words.txt", Encoding.UTF8))
    {
        list = reader.ReadToEnd();
    }

    StringBuilder word = new StringBuilder();

    foreach (char c in list)
    {
        if (c == '\n')
        {
            if (word.Length > 0)
            {
                StopWordsIndex.Add(word.ToString());
                word.Length = 0;
            }
        }
        else if (SuffixNode.AcceptedCharacter(c))
        {
            word.Append(c);
        }
    }

    // Last entry when the file does not end with a newline.
    if (word.Length > 0)
    {
        StopWordsIndex.Add(word.ToString());
    }
}