コード例 #1
0
        //parses the given corpus and places it in the inverted index
        // Replaces InvertedIndex with a fresh SuffixNode, resets the indexing
        // stopwatch, and then consumes every token that an SgmlParser opened on
        // DocumentPath produces (Parser.Next() returning null ends the loop).
        //
        // For each token:
        //   * if it is neither in StopWordsIndex nor accepted by isNumber, it is
        //     stemmed and added to InvertedIndex with weight 2 when isCapital
        //     accepts the stem's first character, otherwise weight 1;
        //   * the first time a DocID is seen, a DocumentVector is registered in
        //     Documents and its entry in DocumentLengths starts at 0;
        //   * DocumentLengths[DocID] is incremented for every token, including
        //     tokens that were filtered out of the index;
        //   * ParseIteration (when it has subscribers) is raised whenever the
        //     DocID differs from the previous token's DocID.
        //
        // The parser is closed even when parsing or indexing throws.
        private void Parse( String DocumentPath )
        {
            InvertedIndex = new SuffixNode(' ');
            invertedIndexWatch.Reset();

            Parser = new SgmlParser(DocumentPath);
            String prevDocId = "";

            try
            {
                String value;
                while ((value = Parser.Next()) != null)
                {
                    if (!StopWordsIndex.HasWord(value) && !isNumber(value))
                    {
                        value = Stem(value);

                        // A null/empty stem has no first character to test and
                        // nothing meaningful to index, so it is skipped.
                        // NOTE(review): capitalisation is checked on the stemmed
                        // token - confirm Stem preserves the original casing.
                        if (!String.IsNullOrEmpty(value))
                        {
                            int weight = isCapital(value[0]) ? 2 : 1;

                            // Only the index insertion itself is timed.
                            invertedIndexWatch.Start();
                            InvertedIndex.Add(value, new DocumentIndex(Parser.DocID, 0), weight);
                            invertedIndexWatch.Stop();
                        }
                    }

                    if (!Documents.Contains(Parser.DocID))
                    {
                        Documents[Parser.DocID] = new DocumentVector(Parser.DocID, Parser.HeadLine, Parser.DateLine, Parser.DocumentPosition);
                        DocumentLengths[Parser.DocID] = 0;
                    }

                    DocumentLengths[Parser.DocID] = ((int)DocumentLengths[Parser.DocID]) + 1;

                    // Fire an event out to any attached methods when a new document starts.
                    if (prevDocId != Parser.DocID && ParseIteration != null)
                        ParseIteration(this);

                    prevDocId = Parser.DocID;
                }
            }
            finally
            {
                // Release the parser even if a token could not be processed.
                Parser.Close();
            }
        }
コード例 #2
0
        //builds a stopword index from the given items in stop_words.txt
        // Replaces StopWordsIndex with a fresh SuffixNode and fills it from the
        // file "stop_words.txt" (a relative path, so it is resolved against the
        // process's current directory).
        //
        // The file is read as UTF-8 text and split on '\n'. Within each line only
        // characters for which SuffixNode.AcceptedCharacter returns true are kept;
        // lines that end up empty are ignored. A final line without a trailing
        // newline is still added.
        //
        // The file is opened read-only and is closed even if reading or indexing
        // throws.
        private void BuildStopWordsIndex()
        {
            StopWordsIndex = new SuffixNode(' ');

            String list;
            using (FileStream stream = new FileStream("stop_words.txt", FileMode.Open, FileAccess.Read))
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                // Decoding through a reader keeps multi-byte UTF-8 sequences intact;
                // decoding fixed-size byte chunks independently can split them.
                list = reader.ReadToEnd();
            }

            StringBuilder word = new StringBuilder();
            foreach (char c in list)
            {
                if (c == '\n')
                {
                    if (word.Length > 0)
                    {
                        StopWordsIndex.Add(word.ToString());
                        word.Length = 0;
                    }
                }
                else if (SuffixNode.AcceptedCharacter(c))
                {
                    word.Append(c);
                }
            }

            // Flush the last word when the file does not end with a newline.
            if (word.Length > 0)
            {
                StopWordsIndex.Add(word.ToString());
            }
        }