public static double[][] ReturnTFIDFVectors(List <Document> documents)
        {
            // Builds one dense TF-IDF vector per document, ordered by the shared
            // MegaDictionary key list, then L2-normalizes the whole set in place.
            // Returns one double[] per input document, in input order.
            List <string>         keysList    = MegaDictionary.ReturnKeysList();
            List <List <double> > TFIDVectors = new List <List <double> >();

            foreach (var document in documents)
            {
                List <double> documentVector = new List <double>();

                foreach (var word in keysList)
                {
                    // TF: relative frequency within the document (0 for an empty document).
                    double tf = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();
                    // IDF: log(N / df). BUG FIX: cast forces floating-point division —
                    // if ReturnTermFrequency returns an integer count, N/df would
                    // truncate to an int before the log, corrupting every weight.
                    double idf = Math.Log((double)documents.Count / MegaDictionary.ReturnTermFrequency(word));
                    documentVector.Add(tf * idf);
                }

                TFIDVectors.Add(documentVector);
            }

            // change into double[][] and normalize
            double[][] vectors = TFIDVectors.Select(v => v.ToArray()).ToArray();
            Normalize(vectors);
            return(vectors);
        }
// Example #2
        private async Task RunKMeans()
        {
            // Reads every file in the app's LocalFolder into memory as UTF-8 text.
            // The k-means clustering pipeline this fed is currently disabled; the
            // method now only verifies that file loading and decoding work.
            // NOTE(review): this instance appears unused — confirm the MegaDictionary
            // constructor has no required side effects before deleting it.
            MegaDictionary mega = new MegaDictionary();

            var allfiles = await ApplicationData.Current.LocalFolder.GetFilesAsync();

            Debug.WriteLine(ApplicationData.Current.LocalFolder.Path);

            string[] contents = new string[allfiles.Count];
            var counter = 0;

            foreach (var storageFile in allfiles)
            {
                IBuffer buffer = await FileIO.ReadBufferAsync(storageFile);

                // BUG FIX: DataReader is IDisposable and was never disposed — wrap in
                // a using block so the underlying buffer resources are released.
                using (DataReader reader = DataReader.FromBuffer(buffer))
                {
                    byte[] fileContent = new byte[reader.UnconsumedBufferLength];
                    reader.ReadBytes(fileContent);
                    contents[counter] = Encoding.UTF8.GetString(fileContent, 0, fileContent.Length);
                }

                counter++;
            }

            // BUG FIX: Debug.WriteLine(array) printed the type name ("System.String[]"),
            // not the file contents — join the documents so the output is meaningful.
            Debug.WriteLine(string.Join(Environment.NewLine, contents));
        }
        public List <Document> parseMultipleDocs(List <string> docs, List <string> ids)
        {
            // Parses each raw document string into a Document, pairing it with the
            // id at the same position in `ids`, then cleanses the global dictionary.
            List <Document> documentList = new List <Document>();

            for (int i = 0; i < docs.Count; i++)
            {
                // BUG FIX: the `ids` parameter was ignored ("" was passed for every
                // document), so no Document ever received its identifier. Use the
                // matching id; fall back to "" if the ids list is shorter than docs.
                string id = i < ids.Count ? ids[i] : "";
                documentList.Add(parseDocument(docs[i], id));
                Debug.WriteLine("Done with document: " + i);
            }

            MegaDictionary.CleanseDictionary();

            return(documentList);
        }
        private double aK()
        {
            // Shrink factor a_K for the current cluster count _k, computed
            // incrementally from the previous call's value (_previousAK).
            // Undefined for _k < 2: the sentinel -1.0 is stored and the assert fires.
            if (_k == 2)
            {
                // Base case: a_2 = 1 - 3/(4d), where d is the vocabulary size.
                int dimensions = MegaDictionary.ReturnKeysList().Count;
                _previousAK = 1 - (3.0 / (4 * dimensions));
            }
            else if (_k > 2)
            {
                // Recurrence: a_K = a_{K-1} + (1 - a_{K-1}) / 6.
                _previousAK += (1 - _previousAK) / 6;
            }
            else
            {
                _previousAK = -1.0;
            }

            Debug.Assert(!_previousAK.Equals(-1.0));
            return(_previousAK);
        }
        public Document parseDocument(string line, string id)
        {
            // Tokenizes one raw line of text into stemmed, filtered terms, recording
            // frequencies locally (termFreqDict) and, for each term's first occurrence
            // in this document, globally (MegaDictionary). Returns the new Document.
            termFreqDict = new Dictionary <string, int>();

            // Normalize: lowercase, strip trailing spaces, drop tabs/newlines.
            string cleaned = line.ToLower().TrimEnd(' ');
            cleaned = Regex.Replace(cleaned, @"\t|\n|\r", "");

            // keep just alphanumeric characters
            cleaned = new Regex("[^a-z0-9 ]").Replace(cleaned, " ");

            cleaned = Regex.Replace(cleaned, string.Format(@"(\p{{L}}{{{0}}})\p{{L}}+", 11), ""); // drop words of 12+ letters
            cleaned = Regex.Replace(cleaned, @"\b\w{1,3}\b", "");                                 // drop words of 3 letters or fewer
            cleaned = Regex.Replace(cleaned, @"\s+", " ");                                        // collapse extra whitespace

            string[] tokens = cleaned.Split(new String[] { " " }, StringSplitOptions.RemoveEmptyEntries);

            var seen    = new HashSet <string>();
            var stemmer = new Stemmer();

            foreach (var token in tokens)
            {
                string stemmed = stemmer.stem(token);

                // Skip stop words and any term containing a digit.
                if (StopWords.stopWordsSet.Contains(stemmed) || stemmed.Any(char.IsDigit))
                {
                    continue;
                }

                addToLocalDict(stemmed);

                // First occurrence in this document: register in the global dictionary.
                if (seen.Add(stemmed))
                {
                    MegaDictionary.AddToDictionary(stemmed);
                }
            }

            return(new Document(termFreqDict, id));
        }
        public static Dictionary <int, double>[] ReturnTFIDFDicts(List <Document> documents)
        {
            // Builds a sparse TF-IDF representation per document: a map from each
            // term's index (in MegaDictionary key order) to its non-zero weight.
            // The resulting array is normalized in place before being returned.
            List <string> keysList = MegaDictionary.ReturnKeysList();

            List <Dictionary <int, double> > TFIDFDictionaryList = new List <Dictionary <int, double> >();
            int counter = 1;

            foreach (var document in documents)
            {
                Debug.WriteLine("TFIDF vector for document #: " + counter);
                Dictionary <int, double> TFIDFDict = new Dictionary <int, double>();

                for (int i = 0; i < keysList.Count; i++)
                {
                    string word = keysList[i];
                    // TF: relative frequency within the document (0 for an empty document).
                    double tf = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();
                    // IDF: log(N / df). BUG FIX: cast forces floating-point division —
                    // if ReturnTermFrequency returns an integer count, N/df would
                    // truncate to an int before the log, corrupting every weight.
                    double idf   = Math.Log((double)documents.Count / MegaDictionary.ReturnTermFrequency(word));
                    double tfidf = tf * idf;

                    // Sparse storage: only keep non-zero weights.
                    if (tfidf != 0)
                    {
                        TFIDFDict.Add(i, tfidf);
                    }
                }

                TFIDFDictionaryList.Add(TFIDFDict);
                counter++;
            }

            // change into array and normalize
            Dictionary <int, double>[] listOfDictionaries = TFIDFDictionaryList.ToArray();
            NormalizeDictionaryArray(listOfDictionaries);
            return(listOfDictionaries);
        }
        public static double[][] LshTfidf(List <Document> documents)
        {
            // Two-pass feature selection for LSH:
            //  1. Per document, compute TF-IDF and keep the words in its top 50%.
            //  2. Union those words into a reduced vocabulary, then emit one binary
            //     presence vector per document over that vocabulary.
            HashSet <string> megaKeysList = new HashSet <string>();

            int counter = 0;

            foreach (var document in documents)
            {
                List <string> keysList = document.ReturnKeysList();
                List <Tuple <string, double> > documentVector = new List <Tuple <string, double> >();

                Debug.WriteLine("Generating k largest tfidf words for document: " + counter);
                for (int i = 0; i < keysList.Count; i++)
                {
                    string word = keysList[i];
                    // -1 marks a term no longer present in the global dictionary.
                    if (!MegaDictionary.ReturnTermFrequency(word).Equals(-1))
                    {
                        double tf = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();
                        // BUG FIX: cast forces floating-point division — if
                        // ReturnTermFrequency returns an integer count, N/df would
                        // truncate to an int before the log, corrupting every weight.
                        double idf = Math.Log((double)documents.Count / MegaDictionary.ReturnTermFrequency(word));
                        documentVector.Add(new Tuple <string, double>(word, tf * idf));
                    }
                }

                // Keep every word whose TF-IDF is >= the kth largest (top 50%).
                // ROBUSTNESS FIX: skip documents with no scored words so quickselect
                // is never invoked on an empty array.
                if (documentVector.Count > 0)
                {
                    Tuple <string, double>[] docVectorArray = documentVector.ToArray();

                    int k = (int)(documentVector.Count * 0.5);
                    Tuple <string, double> kthLargest = QuickSelect.quickselect(docVectorArray, k);

                    foreach (Tuple <string, double> pair in documentVector)
                    {
                        if (pair.Item2 >= kthLargest.Item2)
                        {
                            megaKeysList.Add(pair.Item1);
                        }
                    }
                }
                counter++;
            }

            // Fix an ordering for the reduced vocabulary.
            List <string> wordsList = megaKeysList.ToList();

            // [][] that will store all document vectors
            double[][] TFIDVectors = new double[documents.Count][];

            for (int j = 0; j < documents.Count; j++)
            {
                Debug.WriteLine("Generating actual vectors for : " + j);
                double[] newDocumentVector = new double[wordsList.Count];

                // Binary presence vector: 1 if the document contains the word, else 0.
                for (int i = 0; i < wordsList.Count; i++)
                {
                    newDocumentVector[i] = documents[j].ReturnFrequency(wordsList[i]) == 0 ? 0 : 1;
                }
                TFIDVectors[j] = newDocumentVector;
            }

            return(TFIDVectors);
        }