//parses the given corpus and places it in the inverted index
        private void Parse(String DocumentPath)
        {
            InvertedIndex = new SuffixNode(' ');
            invertedIndexWatch.Reset();

            Parser = new SgmlParser(DocumentPath);
            String value;
            String prevDocId = "";

            while ((value = Parser.Next()) != null)
            {
                if (!StopWordsIndex.HasWord(value) && !isNumber(value))
                {
                    // check capitalisation before stemming, since the stemmer
                    // may normalise case; capitalised terms get double weight
                    int weight = isCapital(value[0]) ? 2 : 1;

                    value = Stem(value);

                    invertedIndexWatch.Start();
                    InvertedIndex.Add(value, new DocumentIndex(Parser.DocID, 0), weight);
                    invertedIndexWatch.Stop();
                }

                if (!Documents.Contains(Parser.DocID))
                {
                    Documents[Parser.DocID] = new DocumentVector(Parser.DocID, Parser.HeadLine, Parser.DateLine, Parser.DocumentPosition);
                    DocumentLengths[Parser.DocID] = 0;
                }

                DocumentLengths[Parser.DocID] = ((int)DocumentLengths[Parser.DocID]) + 1;

                //fire an event out to any attached handlers whenever a new document is reached
                if (prevDocId != Parser.DocID && ParseIteration != null)
                    ParseIteration(this);

                prevDocId = Parser.DocID;
            }
            Parser.Close();
        }
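
        // A minimal sketch of how the two phases fit together: parse the
        // corpus into the trie, then walk every branch of the trie to fill
        // in the tf-idf weights. IndexCorpus is illustrative only and not
        // part of the original class.
        private void IndexCorpus(String documentPath)
        {
            Parse(documentPath);
            TraverseInvertedIndex(InvertedIndex, "");   // start at the root with an empty prefix
        }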
        //perform feature weighting by traversing the inverted index
        private void TraverseInvertedIndex(SuffixNode Node, String word)
        {
            if (Node.LeafNode != null)
            {
                // the feature index depends only on the word, so look it up
                // once rather than once per document in the posting list
                int index = Features.IndexOf(word);

                foreach (KeyValuePair<string, int> pair in Node.LeafNode.DocumentNodeList)
                {
                    DocumentVector document = (DocumentVector)Documents[pair.Key];
                    document.Vector[index] = TFIDF(pair.Value,                           // term frequency in this document
                                                   Node.LeafNode.DocumentNodeList.Count, // number of documents containing the term
                                                   (int)DocumentLengths[pair.Key],       // length of this document
                                                   Documents.Count);                     // total documents in the corpus
                }
            }
            }

            // recurse into every child, extending the accumulated word with
            // the child's character
            for (int i = 0; i < Node.Neighbours.Length; i++)
            {
                if (Node.Neighbours[i] != null)
                    TraverseInvertedIndex(Node.Neighbours[i], word + Node.Neighbours[i].Value);
            }
        }
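
        // A minimal sketch of the conventional tf-idf formula the traversal
        // above assumes; the class's actual TFIDF method is defined elsewhere
        // and may normalise differently (TfIdfSketch is illustrative only).
        private static double TfIdfSketch(int termFreq, int docFreq, int docLength, int totalDocs)
        {
            double tf = (double)termFreq / docLength;           // term frequency, normalised by document length
            double idf = Math.Log((double)totalDocs / docFreq); // inverse document frequency
            return tf * idf;
        }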
        //builds a stopword index from the entries in stop_words.txt
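        // stop_words.txt is assumed to hold one stop word per line, e.g.:
        //
        //   a
        //   an
        //   the
        //
        // characters rejected by SuffixNode.AcceptedCharacter (presumably
        // anything non-alphabetic, including the '\r' of Windows line
        // endings) are simply skipped, so LF and CRLF files should both work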
        private void BuildStopWordsIndex()
        {
            StopWordsIndex = new SuffixNode(' ');

            // read the whole file in one go; decoding fixed-size byte chunks
            // independently can split a multi-byte UTF-8 sequence across two
            // reads and corrupt a word
            String list = File.ReadAllText("stop_words.txt", Encoding.UTF8);

            String word = "";
            foreach (char c in list)
            {
                if (c == '\n')
                {
                    if (word.Length > 0)
                    {
                        StopWordsIndex.Add(word);
                        word = "";
                    }
                }
                else if (SuffixNode.AcceptedCharacter(c))
                    word += c;
            }
            if (word.Length > 0) StopWordsIndex.Add(word);
        }