//private static Regex r = new Regex("([ \\t{}()\",:;. \n])");

/// <summary>
/// Builds one tf-idf vector of length <paramref name="M"/> for every raw document in
/// <paramref name="collection"/>.
/// </summary>
/// <remarks>
/// Steps:
/// 1. Each document is passed through StopwordRemoval.removeStopword and stemmer.stem_list,
///    and the document frequency of every distinct resulting term is counted.
/// 2. Terms are ranked by how close their document frequency is to one fifth of the
///    collection size (integer division), closest first.
/// 3. Up to M distinct positions are drawn at random from the best-ranked 5 * M terms.
///    The draw happens once, so component i of every returned vector refers to the same term.
/// 4. Each component is log(N / df) * calc_tf(document, term), where N is the number of documents.
/// As a side effect the shared <c>keywords</c> collection is cleared and refilled with
/// every ranked term.
/// </remarks>
/// <param name="collection">Collection whose RawDocs are turned into vectors.</param>
/// <param name="M">Number of components in each document vector.</param>
/// <returns>One DocVector per entry of RawDocs, in the same order.</returns>
private static List<DocVector> collectionProcessing(DocCollection collection, int M)
{
    int docCount = collection.RawDocs.Count;

    // Document frequency: in how many documents each term occurs at least once.
    Dictionary<string, int> globalTerms = new Dictionary<string, int>();
    foreach (string documentContent in collection.RawDocs)
    {
        // Remove stopwords, then stem what is left.
        string[] words = StopwordRemoval.removeStopword(documentContent);
        words = stemmer.stem_list(words);

        // A term counts once per document, however often it appears in it.
        HashSet<string> distinctTerms = new HashSet<string>(words);
        foreach (string term in distinctTerms)
        {
            int seenInDocs;
            globalTerms.TryGetValue(term, out seenInDocs); // leaves 0 when the term is new
            globalTerms[term] = seenInDocs + 1;
        }
    }

    // Rank the terms and materialize the result once. The original kept a deferred
    // query here, so every ElementAt call re-sorted the whole dictionary.
    List<KeyValuePair<string, int>> keywords_df =
        (from pair in globalTerms
         orderby Math.Abs(pair.Value - docCount / 5) ascending
         select pair).ToList();

    // Candidate pool: the best-ranked 5 * M terms, clamped to the terms that exist so
    // a small vocabulary can neither index out of range nor make the draw loop forever.
    int topRange = Math.Min(M * 5, keywords_df.Count);
    int dimensions = Math.Min(M, topRange);

    // Draw the keyword positions a single time so that all documents share the same
    // dimensions; redrawing per document makes the vectors incomparable.
    Random r = new Random();
    List<int> indices = new List<int>(Math.Max(dimensions, 0));
    HashSet<int> taken = new HashSet<int>();
    while (indices.Count < dimensions)
    {
        int pos = r.Next(0, topRange);
        if (taken.Add(pos))
        {
            indices.Add(pos);
        }
    }

    // Calculate tf-idf for every document over the shared keyword positions.
    List<DocVector> docVectorSpace = new List<DocVector>(docCount);
    foreach (string document in collection.RawDocs)
    {
        // When fewer than M terms exist, the trailing components keep whatever
        // value DocVector(M) gives them.
        DocVector _docVector = new DocVector(M);
        for (int i = 0; i < indices.Count; ++i)
        {
            KeyValuePair<string, int> pair = keywords_df[indices[i]];
            _docVector.Tf_idf[i] = (float)(Math.Log(docCount * 1.0 / pair.Value) * calc_tf(document, pair.Key));
        }

        docVectorSpace.Add(_docVector);
    }

    // Store keywords.
    // NOTE(review): this stores every ranked term, not only the sampled dimensions,
    // exactly as before - confirm that is what the consumers of `keywords` expect.
    keywords.Clear();
    foreach (KeyValuePair<string, int> pair in keywords_df)
    {
        keywords.Add(pair.Key);
    }

    // Return vector space of whole collection
    return docVectorSpace;
}
private DocVector vector; // Feature of item

#endregion Fields

#region Constructors

/// <summary>
/// Creates an item with a freshly constructed feature vector and with both
/// <c>label</c> and <c>tmpLabel</c> set to -1.
/// </summary>
/// <param name="M">Value passed to the DocVector constructor for the feature vector.</param>
public Item(int M)
{
    this.vector = new DocVector(M);

    // NOTE(review): -1 presumably marks "no label assigned yet" - confirm against the callers.
    this.label = -1;
    this.tmpLabel = -1;
}