/// <summary>
/// Builds one dense TF-IDF vector per document. Columns follow the order of
/// <c>MegaDictionary.ReturnKeysList()</c>; the finished vectors are passed to
/// <c>Normalize</c> before being returned.
/// </summary>
/// <param name="documents">Documents to vectorize. <c>documents.Count</c> is the numerator of the IDF ratio.</param>
/// <returns>A jagged array with one row per document and one entry per key in the key list.</returns>
public static double[][] ReturnTFIDFVectors(List<Document> documents)
{
    // The key list fixes the column ordering shared by every document vector.
    List<string> keysList = MegaDictionary.ReturnKeysList();
    List<List<double>> TFIDVectors = new List<List<double>>();

    foreach (var document in documents)
    {
        List<double> documentVector = new List<double>();

        foreach (var word in keysList)
        {
            // A document with no terms gets tf = 0 instead of dividing by zero.
            double tf = document.UniqueWordsFreq() == 0
                ? 0
                : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();

            // Force floating-point division: if ReturnTermFrequency yields an integer type,
            // Count / frequency would truncate (e.g. 3 / 2 == 1) and distort the logarithm.
            double calc = (double)documents.Count / MegaDictionary.ReturnTermFrequency(word);
            double idf = Math.Log(calc);

            documentVector.Add(tf * idf);
        }

        TFIDVectors.Add(documentVector);
    }

    // Convert to double[][] and hand the vectors to Normalize.
    double[][] vectors = TFIDVectors.Select(v => v.ToArray()).ToArray();
    Normalize(vectors);
    return vectors;
}
/// <summary>
/// Experimental driver: reads every file in the application's local folder as UTF-8 text
/// into an array and writes diagnostics to the debug output. The clustering calls
/// themselves are currently commented out.
/// </summary>
/// <returns>A task that completes once all files have been read.</returns>
private async Task RunKMeans()
{
    // NOTE(review): this instance is never used below; kept in case the constructor
    // performs initialization that other code relies on — confirm before removing.
    MegaDictionary mega = new MegaDictionary();

    // Only referenced by the commented-out clustering code further down.
    List<string> fileNames = new List<string>() { };
    List<string> data = new List<string>();

    var allfiles = await ApplicationData.Current.LocalFolder.GetFilesAsync();
    Debug.WriteLine(ApplicationData.Current.LocalFolder.Path);

    string[] hi = new string[allfiles.Count];

    // get only documents
    //foreach (var file in allfiles)
    //{
    //    fileNames.Add(file.Name);
    //    data.Add(await FileIO.ReadTextAsync(file));
    //}

    for (int index = 0; index < allfiles.Count; index++)
    {
        var storageFile = allfiles[index];
        IBuffer buffer = await FileIO.ReadBufferAsync(storageFile);

        // DataReader is disposable; release it as soon as the bytes have been copied out.
        using (DataReader reader = DataReader.FromBuffer(buffer))
        {
            byte[] fileContent = new byte[reader.UnconsumedBufferLength];
            reader.ReadBytes(fileContent);
            hi[index] = Encoding.UTF8.GetString(fileContent, 0, fileContent.Length);
        }

        //data.Add(text);
        //fileNames.Add(storageFile.Name);
    }

    // NOTE(review): passing the array itself logs its ToString() (the type name),
    // not the file contents.
    Debug.WriteLine(hi);

    //ClusterKMeansTestElkans KMeans = new ClusterKMeansTestElkans(20, data.ToArray(), fileNames.ToArray());
    //KMeans.calcTFIDFVectors();
    //KMeans.GenerateClustersWithK(30);
    //for (int i = 0; i < 10; i++)
    //{
    //    var watch = System.Diagnostics.Stopwatch.StartNew();
    //    KMeans.calcTFIDFVectors();
    //    KMeans.GenerateClustersWithK(5);
    //    watch.Stop();
    //    var elapsedMs = watch.ElapsedMilliseconds;
    //    Debug.WriteLine("Iteration " + i + " Took: " + elapsedMs + " ms");
    //}
}
/// <summary>
/// Parses each raw document with <c>parseDocument</c>, then calls
/// <c>MegaDictionary.CleanseDictionary()</c> once all documents have been processed.
/// </summary>
/// <param name="docs">Raw document texts, one entry per document.</param>
/// <param name="ids">
/// Identifiers parallel to <paramref name="docs"/>. When this list is null or has no entry
/// at a given index, the empty string is used as that document's id.
/// </param>
/// <returns>The parsed documents, in the same order as <paramref name="docs"/>.</returns>
public List<Document> parseMultipleDocs(List<string> docs, List<string> ids)
{
    List<Document> documentList = new List<Document>(docs.Count);

    for (int i = 0; i < docs.Count; i++)
    {
        // Previously every document was given "" as its id and the ids argument was ignored.
        string id = (ids != null && i < ids.Count) ? ids[i] : "";

        documentList.Add(parseDocument(docs[i], id));
        Debug.WriteLine("Done with document: " + i);
    }

    MegaDictionary.CleanseDictionary();
    return documentList;
}
/// <summary>
/// Advances the stored weight <c>_previousAK</c> for the current value of <c>_k</c> and returns it.
/// </summary>
/// <remarks>
/// For <c>_k == 2</c> the weight is seeded as 1 - 3 / (4 * N), where N is the number of keys
/// reported by <c>MegaDictionary.ReturnKeysList()</c>. For larger <c>_k</c> the weight moves
/// one sixth of the remaining distance towards 1. For <c>_k</c> below 2 the weight is set to
/// the sentinel -1.0, which trips the debug assertion.
/// </remarks>
/// <returns>The updated value of <c>_previousAK</c>.</returns>
private double aK()
{
    double updated;

    if (_k > 2)
    {
        // Recurrence step based on the previously stored weight.
        updated = _previousAK + ((1 - _previousAK) / 6);
    }
    else if (_k == 2)
    {
        // Seed value derived from the current dictionary size.
        int keyCount = MegaDictionary.ReturnKeysList().Count;
        updated = 1 - (3.0 / (4 * keyCount));
    }
    else
    {
        // No weight is defined below k = 2; store the sentinel.
        updated = -1.0;
    }

    _previousAK = updated;

    Debug.Assert(!_previousAK.Equals(-1.0));
    return _previousAK;
}
/// <summary>
/// Tokenizes and stems a raw text, recording term counts in a fresh <c>termFreqDict</c>
/// (via <c>addToLocalDict</c>) and registering each distinct accepted word once with
/// <c>MegaDictionary.AddToDictionary</c>.
/// </summary>
/// <param name="line">Raw document text.</param>
/// <param name="id">Identifier passed through to the resulting <c>Document</c>.</param>
/// <returns>A <c>Document</c> built from the term-frequency dictionary and <paramref name="id"/>.</returns>
public Document parseDocument(string line, string id)
{
    termFreqDict = new Dictionary<string, int>();

    // Culture-independent lowering: under some cultures (e.g. tr-TR) ToLower() maps 'I' to a
    // non-ASCII character, which the ASCII filter below would then turn into a word break.
    line = line.ToLowerInvariant();
    line = line.TrimEnd(' ');

    // Keep only ASCII letters, digits and spaces. Tabs and line breaks become spaces here as
    // well; they used to be deleted outright, which glued the words on either side together.
    line = Regex.Replace(line, "[^a-z0-9 ]", " ");

    line = Regex.Replace(line, @"(\p{L}{11})\p{L}+", "");   // drop letter runs of 12 or more
    line = Regex.Replace(line, @"\b\w{1,3}\b", "");         // drop words of three characters or fewer
    line = Regex.Replace(line, @"\s+", " ");                // collapse repeated whitespace

    string[] tokens = line.Split(new String[] { " " }, StringSplitOptions.RemoveEmptyEntries);

    HashSet<string> uniqueWords = new HashSet<string>();
    Stemmer stemmer = new Stemmer();

    foreach (var token in tokens)
    {
        string word = stemmer.stem(token);

        // Skip stop words and anything that still contains a digit after stemming.
        if (StopWords.stopWordsSet.Contains(word) || word.Any(char.IsDigit))
        {
            continue;
        }

        addToLocalDict(word);

        // HashSet.Add returns true only the first time this document sees the word.
        if (uniqueWords.Add(word))
        {
            MegaDictionary.AddToDictionary(word);
        }
    }

    return new Document(termFreqDict, id);
}
/// <summary>
/// Builds one sparse TF-IDF vector per document as a dictionary from key index to weight.
/// The key index is the word's position in <c>MegaDictionary.ReturnKeysList()</c>; entries
/// whose weight is exactly zero are omitted. The result is passed to
/// <c>NormalizeDictionaryArray</c> before being returned.
/// </summary>
/// <param name="documents">Documents to vectorize. <c>documents.Count</c> is the numerator of the IDF ratio.</param>
/// <returns>An array with one dictionary per document, in input order.</returns>
public static Dictionary<int, double>[] ReturnTFIDFDicts(List<Document> documents)
{
    // The key list fixes the index assigned to each word.
    List<string> keysList = MegaDictionary.ReturnKeysList();
    List<Dictionary<int, double>> TFIDFDictionaryList = new List<Dictionary<int, double>>();

    int counter = 1;
    foreach (var document in documents)
    {
        Debug.WriteLine("TFIDF vector for document #: " + counter);
        Dictionary<int, double> TFIDFDict = new Dictionary<int, double>();

        for (int i = 0; i < keysList.Count; i++)
        {
            string word = keysList[i];

            // A document with no terms gets tf = 0 instead of dividing by zero.
            double tf = document.UniqueWordsFreq() == 0
                ? 0
                : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();

            // Force floating-point division: if ReturnTermFrequency yields an integer type,
            // Count / frequency would truncate (e.g. 3 / 2 == 1) and distort the logarithm.
            double calc = (double)documents.Count / MegaDictionary.ReturnTermFrequency(word);
            double idf = Math.Log(calc);
            double tfidf = tf * idf;

            // Sparse representation: only non-zero weights are stored.
            if (tfidf != 0)
            {
                TFIDFDict.Add(i, tfidf);
            }
        }

        TFIDFDictionaryList.Add(TFIDFDict);
        counter++;
    }

    // Convert to an array and hand it to NormalizeDictionaryArray.
    Dictionary<int, double>[] listOfDictionaries = TFIDFDictionaryList.ToArray();
    NormalizeDictionaryArray(listOfDictionaries);
    return listOfDictionaries;
}
/// <summary>
/// Builds binary presence vectors over a reduced vocabulary. For each document the TF-IDF
/// weight of its own terms is computed, and every term whose weight is at least the value
/// selected by <c>QuickSelect.quickselect</c> at position k = half the term count is added
/// to the shared vocabulary. Each document is then encoded as 1/0 per vocabulary word
/// depending on whether <c>ReturnFrequency</c> is non-zero.
/// </summary>
/// <param name="documents">Documents to vectorize. <c>documents.Count</c> is the numerator of the IDF ratio.</param>
/// <returns>One row per document; each row has one 0/1 entry per selected vocabulary word.</returns>
public static double[][] LshTfidf(List<Document> documents)
{
    // Union of the selected words across all documents.
    HashSet<string> megaKeysList = new HashSet<string>();

    int counter = 0;
    foreach (var document in documents)
    {
        List<string> keysList = document.ReturnKeysList();
        List<Tuple<string, double>> documentVector = new List<Tuple<string, double>>();

        Debug.WriteLine("Generating k largest tfidf words for document: " + counter);
        counter++;

        for (int i = 0; i < keysList.Count; i++)
        {
            string word = keysList[i];

            // A term frequency of -1 is treated as "word not usable" and skipped.
            if (MegaDictionary.ReturnTermFrequency(word).Equals(-1))
            {
                continue;
            }

            // A document with no terms gets tf = 0 instead of dividing by zero.
            double tf = document.UniqueWordsFreq() == 0
                ? 0
                : (double)document.ReturnFrequency(word) / document.UniqueWordsFreq();

            // Force floating-point division: if ReturnTermFrequency yields an integer type,
            // Count / frequency would truncate (e.g. 3 / 2 == 1) and distort the logarithm.
            double calc = (double)documents.Count / MegaDictionary.ReturnTermFrequency(word);
            double idf = Math.Log(calc);

            documentVector.Add(new Tuple<string, double>(word, tf * idf));
        }

        // Nothing to select from: avoid running quickselect on an empty array.
        if (documentVector.Count == 0)
        {
            continue;
        }

        // Work on a copy so the selection routine cannot disturb documentVector.
        Tuple<string, double>[] docVectorArray = documentVector.ToArray();

        // for now, use the top 50%
        int k = (int)(documentVector.Count * 0.5);
        Tuple<string, double> kthLargest = QuickSelect.quickselect(docVectorArray, k);

        foreach (Tuple<string, double> pair in documentVector)
        {
            // Keep every word whose weight reaches the selected threshold.
            if (pair.Item2 >= kthLargest.Item2)
            {
                megaKeysList.Add(pair.Item1);
            }
        }
    }

    // Freeze the selected vocabulary into a list to obtain a stable column ordering.
    List<string> wordsList = megaKeysList.ToList();

    double[][] TFIDVectors = new double[documents.Count][];

    // Second pass: encode each document as word presence (1) / absence (0).
    for (int j = 0; j < documents.Count; j++)
    {
        Debug.WriteLine("Generating actual vectors for : " + j);

        double[] newDocumentVector = new double[wordsList.Count];
        for (int i = 0; i < wordsList.Count; i++)
        {
            newDocumentVector[i] = documents[j].ReturnFrequency(wordsList[i]) == 0 ? 0 : 1;
        }

        TFIDVectors[j] = newDocumentVector;
    }

    return TFIDVectors;
}