Exemplo n.º 1
        /// <summary>
        /// Computes a TF-IDF weight for every MegaDictionary key, for each document in
        /// <paramref name="documents"/>, and converts the collected vectors to a jagged array.
        /// </summary>
        /// <param name="documents">Documents to vectorize; its Count is the numerator of the IDF ratio.</param>
        // NOTE(review): this listing shows no braces and no statement that appends
        // documentVector to TFIDVectors, advances counter, or returns vectors.
        // It looks like an excerpt with lines dropped -- confirm against the full file.
        public static double[][] ReturnTFIDFVectors(List <Document> documents)
            // The key list fixes the column order shared by every document vector.
            List <string>         keysList    = MegaDictionary.ReturnKeysList();
            List <List <double> > TFIDVectors = new List <List <double> >();

            // Only referenced by the commented-out debug line below.
            int counter = 1;

            foreach (var document in documents)
                //Debug.WriteLine("TFIDF vector for document #: " + counter);
                List <double> documentVector = new List <double>();
                // Calculate the TF-IDF weight of each dictionary word for this document.
                foreach (var word in keysList)
                    double tf   = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq(); // when UniqueWordsFreq() is 0, use 0 instead of dividing by zero
                    // NOTE(review): documents.Count is an int; if ReturnTermFrequency also returns an
                    // integer type this is integer division and the ratio is truncated before
                    // Math.Log sees it -- confirm the return type and cast to double if so.
                    double calc = documents.Count / MegaDictionary.ReturnTermFrequency(word);
                    // Single-argument Math.Log is the natural logarithm.
                    double idf  = Math.Log(calc);
                    documentVector.Add(tf * idf);


            // Convert each inner list to an array to obtain double[][].
            // NOTE(review): the original comment also mentions normalization, but no
            // normalization step is visible in this excerpt.
            double[][] vectors = TFIDVectors.Select(v => v.ToArray()).ToArray();
Exemplo n.º 2
        /// <summary>
        /// Enumerates the files in the application's local folder and stores one decoded
        /// string per file in a local array. The k-means calls are commented out below.
        /// </summary>
        // NOTE(review): braces and possibly other statements are missing from this listing;
        // the notes below describe only what is visible.
        private async Task RunKMeans()
            MegaDictionary mega = new MegaDictionary();

            // NOTE(review): the next statement has no terminating ';' in this listing.
            // fileNames and data are only used by the commented-out code further down.
            List <string> fileNames = new List <string>()
            List <string> data = new List <string>();

            var allfiles = await ApplicationData.Current.LocalFolder.GetFilesAsync();


            // One slot per file found.
            string[] hi = new string[allfiles.Count];

            // get only documents
            //foreach (var file in allfiles)
            //    fileNames.Add(file.Name);
            //    data.Add(await FileIO.ReadTextAsync(file));
            // Index into hi. NOTE(review): no increment is visible in this excerpt, so as
            // shown every file would be written to slot 0 -- confirm against the full file.
            var counter = 0;

            foreach (var storageFile in allfiles)
                IBuffer buffer = await FileIO.ReadBufferAsync(storageFile);

                DataReader reader      = DataReader.FromBuffer(buffer);
                // NOTE(review): fileContent is sized from the reader, but no visible call copies
                // bytes from the reader into it before decoding; as shown, the decoded text
                // comes from a zero-filled array. A ReadBytes call may have been dropped.
                byte[]     fileContent = new byte[reader.UnconsumedBufferLength];
                string text = Encoding.UTF8.GetString(fileContent, 0, fileContent.Length);

                hi[counter] = text;


            //ClusterKMeansTestElkans KMeans = new ClusterKMeansTestElkans(20, data.ToArray(), fileNames.ToArray());

            //for (int i = 0; i < 10; i++)
            //    var watch = System.Diagnostics.Stopwatch.StartNew();
            //    KMeans.calcTFIDFVectors();
            //    KMeans.GenerateClustersWithK(5);
            //    watch.Stop();
            //    var elapsedMs = watch.ElapsedMilliseconds;
            //    Debug.WriteLine("Iteration " + i + " Took: " + elapsedMs + " ms");
Exemplo n.º 3
        /// <summary>
        /// Parses every string in <paramref name="docs"/> with parseDocument and collects
        /// the resulting Document objects in a list, logging progress after each one.
        /// </summary>
        /// <param name="docs">Raw document texts, parsed in index order.</param>
        /// <param name="ids">Not read anywhere in the visible code.</param>
        // NOTE(review): no return statement or braces are visible in this excerpt.
        public List <Document> parseMultipleDocs(List <string> docs, List <string> ids)
            List <Document> documentList = new List <Document>();

            for (int i = 0; i < docs.Count; i++)
                // NOTE(review): an empty string is passed as the id instead of ids[i];
                // confirm whether every document is meant to share the id "".
                documentList.Add(parseDocument(docs[i], ""));
                Debug.WriteLine("Done with document: " + i);


        /// <summary>
        /// Updates the _previousAK field according to the current value of _k.
        /// </summary>
        // NOTE(review): braces and the return statement are missing from this listing, so the
        // branch structure is ambiguous. The last assignment reads like the body of a final
        // else (_k > 2) branch, but that cannot be confirmed from here.
        private double aK()
            if (_k < 2)
                // Values of _k below 2 store -1.0.
                _previousAK = -1.0;
            else if (_k == 2)
                // The (double) cast makes this a floating-point division by four times the key count.
                _previousAK = 1 - ((double)3 / (4 * MegaDictionary.ReturnKeysList().Count)); // starting value for _k == 2, derived from the dictionary size
                // Moves the stored value one sixth of the remaining distance towards 1.
                _previousAK = _previousAK + ((1 - _previousAK) / 6);

Exemplo n.º 5
        /// <summary>
        /// Normalizes a raw line of text (lower-casing, character filtering, length filtering),
        /// splits it into words, stems each word, and returns a Document built from
        /// termFreqDict and <paramref name="id"/>.
        /// </summary>
        /// <param name="line">Raw text to parse.</param>
        /// <param name="id">Identifier passed through to the Document constructor.</param>
        /// <returns>A new Document created from termFreqDict and <paramref name="id"/>.</returns>
        // NOTE(review): braces and the bodies of the two inner if statements are missing from
        // this listing, so the code that fills termFreqDict and uniqueWords is not visible.
        public Document parseDocument(string line, string id)
            // termFreqDict is not declared locally; presumably a field that is reset per call -- verify.
            termFreqDict = new Dictionary <string, int>();

            line = line.ToLower();
            line = line.TrimEnd(' ');
            // Tabs and line breaks are replaced with the empty string, not a space.
            // NOTE(review): words separated only by a line break are therefore joined -- confirm this is intended.
            line = Regex.Replace(line, @"\t|\n|\r", "");

            Regex rgx = new Regex("[^a-z0-9 ]"); // matches anything that is not a lowercase letter, digit or space

            // Every other character becomes a space.
            line = rgx.Replace(line, " ");

            // The formatted pattern is (\p{L}{11})\p{L}+, i.e. a run of 12 or more letters.
            line = Regex.Replace(line, string.Format(@"(\p{{L}}{{{0}}})\p{{L}}+", 11), ""); // remove runs of 12 or more letters
            line = Regex.Replace(line, @"\b\w{1,3}\b", "");                                 // remove words that have three letters or fewer
            line = Regex.Replace(line, @"\s+", " ");                                        // collapse runs of whitespace into one space

            // Split on single spaces, dropping empty entries.
            var noSpaces = line.Split(new String[] { " " }, StringSplitOptions.RemoveEmptyEntries);

            HashSet <string> uniqueWords = new HashSet <string>();

            Stemmer stemmer = new Stemmer();

            foreach (var s in noSpaces)
                // stem words
                string word = stemmer.stem(s);
                // Keep the stemmed word only if it is not a stop word and contains no digit.
                if (!StopWords.stopWordsSet.Contains(word) && !word.Any(c => char.IsDigit(c)))

                    // First occurrence of this word in the document (body not visible in this excerpt).
                    if (!uniqueWords.Contains(word))

            return(new Document(termFreqDict, id));
Exemplo n.º 6
        /// <summary>
        /// Computes a sparse TF-IDF representation for each document in
        /// <paramref name="documents"/>: a dictionary from the word's index in the
        /// MegaDictionary key list to its non-zero TF-IDF weight.
        /// </summary>
        /// <param name="documents">Documents to vectorize; its Count is the numerator of the IDF ratio.</param>
        // NOTE(review): this listing shows no braces and no statement that adds TFIDFDict to
        // TFIDFDictionaryList, advances counter, or returns the array. It looks like an
        // excerpt with lines dropped -- confirm against the full file.
        public static Dictionary <int, double>[] ReturnTFIDFDicts(List <Document> documents)
            // The key list fixes the index used as dictionary key for each word.
            List <string> keysList = MegaDictionary.ReturnKeysList();

            List <Dictionary <int, double> > TFIDFDictionaryList = new List <Dictionary <int, double> >();
            // Used only in the debug message below.
            int counter = 1;

            foreach (var document in documents)
                Debug.WriteLine("TFIDF vector for document #: " + counter);
                Dictionary <int, double> TFIDFDict = new Dictionary <int, double>();

                // Calculate the TF-IDF weight of each dictionary word for this document.
                for (int i = 0; i < keysList.Count; i++)
                    string word  = keysList[i];
                    double tf    = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq(); // when UniqueWordsFreq() is 0, use 0 instead of dividing by zero
                    // NOTE(review): documents.Count is an int; if ReturnTermFrequency also returns an
                    // integer type this is integer division and the ratio is truncated before
                    // Math.Log sees it -- confirm the return type and cast to double if so.
                    double calc  = documents.Count / MegaDictionary.ReturnTermFrequency(word);
                    // Single-argument Math.Log is the natural logarithm.
                    double idf   = Math.Log(calc);
                    double tfidf = tf * idf;

                    // Only store non-zero weights, keyed by the word's position in keysList.
                    if (tfidf != 0)
                        TFIDFDict.Add(i, tfidf);


            // Convert the list of dictionaries to an array.
            // NOTE(review): the original comment also mentions normalization, but no
            // normalization step is visible in this excerpt.
            Dictionary <int, double>[] listOfDictionaries = TFIDFDictionaryList.ToArray();
Exemplo n.º 7
        /// <summary>
        /// Two-pass vectorization. The first pass scores each document's own words by TF-IDF and
        /// selects a threshold weight per document via QuickSelect. The second pass builds, for
        /// every document, a 0/1 presence vector over the words collected in megaKeysList.
        /// </summary>
        /// <param name="documents">Documents to vectorize; its Count is the numerator of the IDF ratio.</param>
        // NOTE(review): braces, the body of the selection if, any counter increment and the
        // return statement are missing from this listing -- confirm against the full file.
        public static double[][] LshTfidf(List <Document> documents)
            // Reduced vocabulary, filled from the per-document selections below.
            HashSet <string> megaKeysList = new HashSet <string>();

            // Used only in the debug message below.
            int counter = 0;

            foreach (var document in documents)
                List <string> keysList = document.ReturnKeysList();
                List <Tuple <string, double> > documentVector = new List <Tuple <string, double> >();

                Debug.WriteLine("Generating k largest tfidf words for document: " + counter);
                for (int i = 0; i < keysList.Count; i++)
                    string word = keysList[i];
                    // Skip words whose term frequency is reported as -1
                    // (presumably a "not in dictionary" sentinel -- verify).
                    if (!MegaDictionary.ReturnTermFrequency(word).Equals(-1))
                        double tf   = document.UniqueWordsFreq() == 0 ? 0 : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq(); // when UniqueWordsFreq() is 0, use 0 instead of dividing by zero
                        // NOTE(review): documents.Count is an int; if ReturnTermFrequency also returns an
                        // integer type this is integer division and the ratio is truncated before
                        // Math.Log sees it -- confirm the return type and cast to double if so.
                        double calc = documents.Count / MegaDictionary.ReturnTermFrequency(word);
                        // Single-argument Math.Log is the natural logarithm.
                        double idf  = Math.Log(calc);
                        documentVector.Add(new Tuple <string, double>(word, tf * idf));

                // Copy to an array for QuickSelect.
                Tuple <string, double>[] docVectorArray = documentVector.ToArray();

                // For now, k is half the number of scored words, truncated towards zero.
                int k = (int)(documentVector.Count * 0.5);
                // NOTE(review): QuickSelect.quickselect is defined elsewhere; its behavior for an
                // empty array or k == 0, and whether k is 0- or 1-based, is not visible here.
                Tuple <string, double> kthLargest = QuickSelect.quickselect(docVectorArray, k);

                foreach (Tuple <string, double> pair in documentVector)
                    if (pair.Item2 >= kthLargest.Item2)
                        // Add to the keys list if the weight is at least the selected threshold
                        // (the statement that does so is not visible in this excerpt).

            // megaKeysList is converted to a list to give the selected words a fixed ordering.
            List <string> wordsList = megaKeysList.ToList();

            // One row per document.
            // NOTE(review): despite the name, the values stored below are 0/1 presence flags, not TF-IDF weights.
            double[][] TFIDVectors = new double[documents.Count][];

            // Loop through the documents again and create a vector for each.
            for (int j = 0; j < documents.Count; j++)
                Debug.WriteLine("Generating actual vectors for : " + j);
                double[] newDocumentVector = new double[wordsList.Count];

                for (int i = 0; i < wordsList.Count; i++)
                    // 1 if the document's frequency for the word is non-zero, otherwise 0.
                    newDocumentVector[i] = documents[j].ReturnFrequency(wordsList[i]) == 0 ? 0 : 1;
                TFIDVectors[j] = newDocumentVector;
