Ejemplo n.º 1
0
        /// <summary>
        /// Builds feature vectors for every <c>*.txt</c> file directly inside a folder
        /// and writes them, together with the keyword list, to a text file.
        /// </summary>
        /// <remarks>
        /// Output layout (values separated by a single space):
        /// line 1 = number of files read, line 2 = <paramref name="M"/>,
        /// line 3 = the first M entries of the shared <c>keywords</c> list,
        /// followed by one line of M values for each vector returned by
        /// <c>collectionProcessing</c>.
        /// Numbers are formatted with the current culture, exactly as before.
        /// </remarks>
        /// <param name="folderPath">Directory searched for <c>*.txt</c> files.</param>
        /// <param name="M">Number of keywords, i.e. components per document vector.</param>
        /// <param name="output">Path of the feature file; an existing file is overwritten.</param>
        public static void extractFeatures(string folderPath, int M, string output = "articles.feat")
        {
            // Enumerate the folder once so the file count written below always
            // matches the set of files that was actually read.
            string[] files = Directory.GetFiles(folderPath, "*.txt");

            // Collect all files
            DocCollection collection = new DocCollection();
            foreach (string filepath in files)
            {
                collection.collect(File.ReadAllText(filepath));
            }

            // The vectors must be built for M keywords, because M components per
            // document are written below (this used to be a hard-coded 20).
            List<DocVector> docVect = collectionProcessing(collection, M);

            // Store features
            using (StreamWriter writer = new StreamWriter(output, false))
            {
                writer.WriteLine(files.Length); // Number of files
                writer.WriteLine(M);            // Number of keywords

                // Store bag of words
                for (int i = 0; i < M; ++i)
                {
                    writer.Write(keywords[i] + " ");
                }
                writer.WriteLine();

                // Store the feature vector of each document, one per line
                foreach (DocVector vector in docVect)
                {
                    for (int i = 0; i < M; ++i)
                    {
                        writer.Write(vector.Tf_idf[i] + " ");
                    }
                    writer.WriteLine();
                }
            }
        }
Ejemplo n.º 2
0
        //private static Regex r = new Regex("([ \\t{}()\",:;. \n])");
        /// <summary>
        /// Turns a raw document collection into tf-idf vectors over a shared set of
        /// keywords and publishes that keyword set through the <c>keywords</c> list.
        /// </summary>
        /// <remarks>
        /// Steps: remove stopwords and stem each document, count the document
        /// frequency of every distinct term, rank terms by how close their document
        /// frequency is to one fifth of the collection size, pick up to
        /// <paramref name="M"/> keywords at random from the best <c>M * 5</c> ranked
        /// terms, then compute one vector per document.
        /// The keywords are picked once for the whole collection, so component
        /// <c>i</c> refers to the same term in every vector. Afterwards
        /// <c>keywords[i]</c> is the term behind component <c>i</c>; the terms that
        /// were not picked follow in ranked order.
        /// </remarks>
        /// <param name="collection">Collection whose <c>RawDocs</c> are processed.</param>
        /// <param name="M">Number of components per document vector.</param>
        /// <returns>One <c>DocVector</c> per raw document, in <c>RawDocs</c> order.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="collection"/> is null.</exception>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="M"/> is negative.</exception>
        private static List<DocVector> collectionProcessing(DocCollection collection, int M)
        {
            if (collection == null)
            {
                throw new ArgumentNullException("collection");
            }
            if (M < 0)
            {
                throw new ArgumentOutOfRangeException("M", "The number of keywords cannot be negative.");
            }

            var docCount = collection.RawDocs.Count;

            // Document frequency: in how many documents each term occurs
            Dictionary<string, int> globalTerms = new Dictionary<string, int>();
            foreach (string documentContent in collection.RawDocs)
            {
                // Remove stopwords, then stem
                string[] words = StopwordRemoval.removeStopword(documentContent);
                words = stemmer.stem_list(words);

                // A term counts once per document, however often it occurs in it
                foreach (string term in new HashSet<string>(words))
                {
                    int seen;
                    globalTerms.TryGetValue(term, out seen); // seen stays 0 for a new term
                    globalTerms[term] = seen + 1;
                }
            }

            // Rank terms by the distance of their document frequency from
            // docCount / 5. The result is materialized once: the deferred query
            // used to be re-sorted on every single ElementAt call.
            List<KeyValuePair<string, int>> rankedTerms = globalTerms
                .OrderBy(pair => Math.Abs(pair.Value - docCount / 5))
                .ToList();

            // Candidates are the best M * 5 ranked terms, clamped to what exists so
            // a small vocabulary can no longer run past the end of the ranking.
            int topRange = (int)Math.Min((long)M * 5, rankedTerms.Count);
            int selectedCount = Math.Min(M, topRange);

            // Partial Fisher-Yates shuffle: afterwards the first selectedCount
            // entries of candidates are distinct random positions in [0, topRange).
            int[] candidates = Enumerable.Range(0, topRange).ToArray();
            Random rng = new Random();
            for (int i = 0; i < selectedCount; ++i)
            {
                int j = rng.Next(i, topRange);
                int swapped = candidates[i];
                candidates[i] = candidates[j];
                candidates[j] = swapped;
            }

            // The idf factor depends only on the term, so compute it once per keyword
            double[] idf = new double[selectedCount];
            for (int i = 0; i < selectedCount; ++i)
            {
                idf[i] = Math.Log(docCount * 1.0 / rankedTerms[candidates[i]].Value);
            }

            // Calculate tf-idf for every document against the shared keyword set
            List<DocVector> docVectorSpace = new List<DocVector>();
            foreach (string document in collection.RawDocs)
            {
                // Components at index selectedCount and above (only when fewer than
                // M terms exist) keep whatever value DocVector initializes them to.
                DocVector docVector = new DocVector(M);
                for (int i = 0; i < selectedCount; ++i)
                {
                    string term = rankedTerms[candidates[i]].Key;
                    docVector.Tf_idf[i] = (float)(idf[i] * calc_tf(document, term));
                }
                docVectorSpace.Add(docVector);
            }

            // Store keywords: first the picked ones in component order, so that
            // keywords[i] matches Tf_idf[i], then every remaining term in ranked order.
            keywords.Clear();
            HashSet<int> picked = new HashSet<int>();
            for (int i = 0; i < selectedCount; ++i)
            {
                picked.Add(candidates[i]);
                keywords.Add(rankedTerms[candidates[i]].Key);
            }
            for (int pos = 0; pos < rankedTerms.Count; ++pos)
            {
                if (!picked.Contains(pos))
                {
                    keywords.Add(rankedTerms[pos].Key);
                }
            }

            // Return vector space of whole collection
            return docVectorSpace;
        }