// Streams the vector file at IncomingTextLocation and, for every row, computes the cosine
// similarity between that row's vector and each vector in WordGroupVectors. A row is written
// to OutputLocation (word followed by one similarity per group) when the absolute similarity
// to at least one group is >= CosineCutoff.
//
// Input            - not read by this method.
// ThreadsAvailable - used as MaxDegreeOfParallelism for the row loop.
// Returns a new, empty Payload.
//
// Each row is expected to be: word, then VectorSize space-separated numbers. A non-blank row
// with fewer tokens, or a token that is not a number, throws from inside Parallel.ForEach.
public Payload RunPlugin(Payload Input, int ThreadsAvailable)
{
    // Numbers in the vector file and in the delimited output are machine-readable data, so they
    // are parsed/formatted with the invariant culture. With the current culture, a decimal-comma
    // locale would misread "0.123" and would emit "0,123", which collides with a "," delimiter.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;

    using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation, Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
    {
        //write the header: "Word" followed by one quoted column per entry in OutputHeaderData
        StringBuilder headerRow = new StringBuilder(Quotes + "Word" + Quotes);
        for (int i = 0; i < OutputHeaderData.Keys.Count; i++)
        {
            headerRow.Append(Delimiter + Quotes + OutputHeaderData[i].Replace(Quotes, Quotes + Quotes) + Quotes);
        }
        OutputWriter.WriteString(headerRow.ToString());

        var lines = File.ReadLines(IncomingTextLocation, Encoding.GetEncoding(SelectedEncoding));

        //when a vocab size is known, the first row is skipped
        //NOTE(review): presumably that row is a "vocab-size vector-size" header - confirm
        if (VocabSize != -1)
        {
            lines = lines.Skip(1);
        }

        //the norm of each word-group vector does not depend on the row being processed,
        //so it is computed once here instead of once per row
        int groupCount = WordGroupVectors.Keys.Count;
        double[] groupNorms = new double[groupCount];
        for (int g = 0; g < groupCount; g++)
        {
            double sumOfSquares = 0;
            for (int i = 0; i < VectorSize; i++)
            {
                sumOfSquares += WordGroupVectors[g][i] * WordGroupVectors[g][i];
            }
            groupNorms[g] = Math.Sqrt(sumOfSquares);
        }

        int LineNumber = 0;
        TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);

        //the timer periodically reports how many rows have been handled so far
        using (new System.Threading.Timer(_ => SetUpdate(LineNumber), null, reportPeriod, reportPeriod))
        {
            Parallel.ForEach(lines, new ParallelOptions { MaxDegreeOfParallelism = ThreadsAvailable }, line =>
            {
                string trimmedLine = line.TrimEnd();

                //a blank line (e.g. a trailing newline at the end of the file) carries no vector;
                //count it and move on rather than failing on a missing token
                if (trimmedLine.Length == 0)
                {
                    Interlocked.Increment(ref LineNumber);
                    return;
                }

                string[] splitLine = trimmedLine.Split(new[] { ' ' });
                string rowWord = splitLine[0].Trim();

                //parse the row vector and accumulate its squared norm in the same pass
                double[] rowVector = new double[VectorSize];
                double rowSumOfSquares = 0;
                for (int i = 0; i < VectorSize; i++)
                {
                    rowVector[i] = Double.Parse(splitLine[i + 1], invariant);
                    rowSumOfSquares += rowVector[i] * rowVector[i];
                }
                double rowNorm = Math.Sqrt(rowSumOfSquares);

                //Cosine Similarity (d1, d2) = Dot product(d1, d2) / (||d1|| * ||d2||)
                //https://janav.wordpress.com/2013/10/27/tf-idf-and-cosine-similarity/
                double[] cosSims = new double[groupCount];
                bool writeRow = false;
                for (int g = 0; g < groupCount; g++)
                {
                    double dotProduct = 0;
                    for (int i = 0; i < VectorSize; i++)
                    {
                        dotProduct += WordGroupVectors[g][i] * rowVector[i];
                    }

                    //a zero-length vector yields NaN here; NaN never satisfies the cutoff test
                    cosSims[g] = dotProduct / (groupNorms[g] * rowNorm);

                    if (Math.Abs(cosSims[g]) >= CosineCutoff)
                    {
                        writeRow = true;
                    }
                }

                if (writeRow)
                {
                    StringBuilder outputRow = new StringBuilder();
                    outputRow.Append(Quotes + rowWord.Replace(Quotes, Quotes + Quotes) + Quotes);
                    for (int i = 0; i < cosSims.Length; i++)
                    {
                        outputRow.Append(Delimiter + cosSims[i].ToString(invariant));
                    }
                    OutputWriter.WriteString(outputRow.ToString());
                }

                Interlocked.Increment(ref LineNumber);
            });
        }
    } //end outputwriter

    return (new Payload());
}
// Reads a CSV file from IncomingTextLocation and writes it to OutputLocation as a tab-separated
// dictionary file: a "%" line, one "number<TAB>category name" line per category, a second "%"
// line, then one "word<TAB>category numbers" line per word (words sorted).
//
// Two CSV layouts are handled, selected by CSVStyle:
//   "Poster"        - every column is a category; each cell below the header is a word in it.
//   anything else   - "table" layout: column 0 holds the word and every later column with a
//                     non-blank cell marks membership in that column's category.
//
// Input and ThreadsAvailable are not read by this method. Returns a new, empty Payload.
public Payload RunPlugin(Payload Input, int ThreadsAvailable)
{
    uint WordsProcessed = 0;
    List<string> WordList = new List<string>();
    Dictionary<int, string> CategoryNumNameMap = new Dictionary<int, string>();
    Dictionary<string, List<int>> WordCategoryMap = new Dictionary<string, List<int>>();
    bool posterStyle = CSVStyle == "Poster";

    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);

    //the timer periodically reports how many word/category pairings have been recorded
    using (new System.Threading.Timer(_ => SetUpdate(WordsProcessed), null, reportPeriod, reportPeriod))
    {
        //read in all of the basic dictionary data from the CSV file
        using (var stream = File.OpenRead(IncomingTextLocation))
        using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(SelectedEncoding)))
        {
            var data = CsvParser.ParseHeadAndTail(reader, Delimiter[0], Quote[0]);

            //NOTE(review): `header` is not declared in this method - presumably a member that is
            //populated elsewhere, or it was meant to be data.Item1. Confirm.

            //map category numbers to header names, skipping blank header cells.
            //poster layout: column i becomes category i + 1.
            //table layout:  column 0 is the word column, so column i (i > 0) becomes category i.
            int firstCategoryColumn = posterStyle ? 0 : 1;
            int categoryNumberOffset = posterStyle ? 1 : 0;
            for (int i = firstCategoryColumn; i < header.Length; i++)
            {
                string categoryName = header[i].Trim();
                if (!String.IsNullOrWhiteSpace(categoryName))
                {
                    CategoryNumNameMap.Add(i + categoryNumberOffset, categoryName);
                }
            }

            foreach (var line in data.Item2)
            {
                //a row that fails part-way through is abandoned and the next row is processed;
                //this best-effort handling of malformed rows is intentional
                try
                {
                    if (posterStyle)
                    {
                        for (int i = 0; i < line.Count(); i++)
                        {
                            //we only want to add the word if we've actually got a corresponding
                            //header to go with the column that the word is in.
                            int category = i + 1;
                            if (!CategoryNumNameMap.ContainsKey(category))
                            {
                                continue;
                            }

                            string cellWord = line[i].Trim();
                            if (ConvertToLower)
                            {
                                cellWord = cellWord.ToLower();
                            }
                            if (string.IsNullOrWhiteSpace(cellWord))
                            {
                                continue;
                            }

                            if (TryAddWordCategory(WordCategoryMap, WordList, cellWord, category))
                            {
                                WordsProcessed++;
                            }
                        }
                    }
                    else
                    {
                        string rowWord = line[0].Trim();
                        if (ConvertToLower)
                        {
                            rowWord = rowWord.ToLower();
                        }
                        if (string.IsNullOrWhiteSpace(rowWord))
                        {
                            continue;
                        }

                        for (int i = 1; i < line.Count(); i++)
                        {
                            //any non-blank cell under a named header marks membership
                            if (CategoryNumNameMap.ContainsKey(i) && !String.IsNullOrWhiteSpace(line[i]))
                            {
                                if (TryAddWordCategory(WordCategoryMap, WordList, rowWord, i))
                                {
                                    WordsProcessed++;
                                }
                            }
                        }
                    }
                }
                catch
                {
                }
            }
        }

        WordList.Sort();

        using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation, Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
        {
            //write the header. Category numbers are not necessarily contiguous (a blank header
            //cell leaves a gap), so the actual keys are enumerated rather than assuming 1..Count.
            OutputWriter.WriteString("%");
            foreach (var category in CategoryNumNameMap.OrderBy(entry => entry.Key))
            {
                OutputWriter.WriteString(category.Key.ToString() + "\t" + category.Value);
            }
            OutputWriter.WriteString("%");

            //write the dictionary body
            foreach (string word in WordList)
            {
                List<int> categories = WordCategoryMap[word];
                categories.Sort();
                OutputWriter.WriteString(word + "\t" + String.Join("\t", categories.Select(x => x.ToString())));
            }
        }
    }

    return (new Payload());
}

// Records that `word` belongs to `category`, registering the word in `wordList` the first time it
// is seen. Returns false when that exact word/category pairing was already recorded.
private static bool TryAddWordCategory(Dictionary<string, List<int>> wordCategoryMap, List<string> wordList, string word, int category)
{
    List<int> categories;
    if (wordCategoryMap.TryGetValue(word, out categories))
    {
        if (categories.Contains(category))
        {
            return false;
        }
        categories.Add(category);
        return true;
    }

    wordCategoryMap.Add(word, new List<int>() { category });
    wordList.Add(word);
    return true;
}
// Parses the dictionary held in DictDataRawMeta and writes it to OutputLocation as a CSV file.
//
// Two CSV layouts are produced, selected by CSVStyle:
//   "Poster"        - one column per category; each column lists that category's words (sorted),
//                     padded with empty cells to the length of the longest column.
//   anything else   - "table" layout: an "Entry" column holding each word (sorted), followed by
//                     one column per category containing "X" where the word belongs to it.
//
// Input and ThreadsAvailable are not read by this method. Returns a new, empty Payload in every
// case; parse and write failures are reported through a message box rather than thrown.
public Payload RunPlugin(Payload Input, int ThreadsAvailable)
{
    DictionaryData ParsedDict = new DictionaryData();

    try
    {
        DictParser DP = new DictParser();
        ParsedDict = DP.ParseDict(DictDataRawMeta);
    }
    catch
    {
        MessageBox.Show("There was an error trying to parse your dictionary file. Please make sure that your dictionary file is correctly formatted.", "Error Parsing Dictionary", MessageBoxButtons.OK, MessageBoxIcon.Error);
        return (new Payload());
    }

    //ParsedDict.FullDictionary structure
    //--"Wildcards"
    //----int Word Count
    //--------Words
    //----------categories[]
    //--"Standards"
    //----int Word Count
    //--------Words
    //----------categories[]

    NumCats = ParsedDict.NumCats;
    bool posterStyle = CSVStyle == "Poster";

    //either section may be absent from a parsed dictionary, so both layouts
    //check for a section before reading it. Wildcards are always read first.
    string[] dictionarySections = new[] { "Wildcards", "Standards" };

    uint WordsProcessed = 0;
    TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);

    //the timer periodically reports how many dictionary words have been handled so far
    using (new System.Threading.Timer(_ => SetUpdate(WordsProcessed), null, reportPeriod, reportPeriod))
    {
        try
        {
            using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation, Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
            {
                #region set up and write the header
                //the header is built in a fresh list so ParsedDict.CatNames is left unmodified
                List<string> header = new List<string>();
                if (!posterStyle)
                {
                    header.Add("Entry");
                }

                int headerNameCount = posterStyle ? ParsedDict.CatNames.Length : ParsedDict.NumCats;
                for (int i = 0; i < headerNameCount; i++)
                {
                    header.Add(ParsedDict.CatNames[i]);
                }

                //quote every header cell, doubling embedded quote characters just like word cells
                for (int i = 0; i < header.Count; i++)
                {
                    header[i] = Quote + (header[i] ?? "").Replace(Quote, Quote + Quote) + Quote;
                }
                OutputWriter.WriteString(String.Join(Delimiter, header));
                #endregion

                #region map category codes to output columns
                //poster columns are 0-based; table columns are shifted by one for the "Entry" column
                int columnOffset = posterStyle ? 0 : 1;
                Dictionary<string, int> CatIndices = new Dictionary<string, int>();
                for (int i = 0; i < ParsedDict.CatValues.Length; i++)
                {
                    //we have to make sure that the category mapped key doesn't already exist in the CatIndices variable
                    //otherwise, a person can accidentally re-use the same category number (e.g., 10) for multiple categories,
                    //which will screw things up
                    if (!CatIndices.ContainsKey(ParsedDict.CatValues[i]))
                    {
                        CatIndices.Add(ParsedDict.CatValues[i], i + columnOffset);
                    }
                    else
                    {
                        MessageBox.Show("Your dictionary file appears to use the same code to refer to muliple categories (" + ParsedDict.CatValues[i] + "). All categories that use this code will be omitted, except for the first category that used this code.", "Dictionary Formatting Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    }
                }
                #endregion

                if (posterStyle)
                {
                    #region write poster style csv
                    //one word list per category column
                    List<List<string>> WordMap = new List<List<string>>();
                    for (int i = 0; i < NumCats; i++)
                    {
                        WordMap.Add(new List<string>());
                    }

                    foreach (string section in dictionarySections)
                    {
                        if (!ParsedDict.FullDictionary.ContainsKey(section))
                        {
                            continue;
                        }

                        var sectionEntries = ParsedDict.FullDictionary[section];
                        foreach (int wordcount in sectionEntries.Keys)
                        {
                            var wordsForCount = sectionEntries[wordcount];
                            foreach (string word in wordsForCount.Keys)
                            {
                                string quotedWord = Quote + word.Replace(Quote, Quote + Quote) + Quote;
                                foreach (string categoryCode in wordsForCount[word])
                                {
                                    int column;
                                    if (CatIndices.TryGetValue(categoryCode, out column))
                                    {
                                        WordMap[column].Add(quotedWord);
                                    }
                                }
                                WordsProcessed++;
                            }
                        }
                    }

                    //now that we've populated the word map, the parsed dictionary is no longer needed
                    ParsedDict = new DictionaryData();

                    //sort each column and find the longest one, which fixes the number of rows
                    int MaxWords = 0;
                    for (int i = 0; i < NumCats; i++)
                    {
                        WordMap[i].Sort();
                        if (WordMap[i].Count > MaxWords)
                        {
                            MaxWords = WordMap[i].Count;
                        }
                    }

                    //write row by row; a column that has run out of words contributes an empty cell
                    for (int row = 0; row < MaxWords; row++)
                    {
                        string[] rowCells = new string[NumCats];
                        for (int col = 0; col < NumCats; col++)
                        {
                            rowCells[col] = row < WordMap[col].Count ? WordMap[col][row] : "";
                        }
                        OutputWriter.WriteString(String.Join(Delimiter, rowCells));
                    }

                    WordMap.Clear();
                    #endregion
                }
                else
                {
                    #region write table style csv
                    List<string> WordList = new List<string>();
                    Dictionary<string, string[]> WordListUnpacked = new Dictionary<string, string[]>();

                    #region deconstruct dictionary object
                    //flatten the parsed dictionary into a word list plus a word -> category codes lookup
                    foreach (string section in dictionarySections)
                    {
                        if (!ParsedDict.FullDictionary.ContainsKey(section))
                        {
                            continue;
                        }

                        var sectionEntries = ParsedDict.FullDictionary[section];
                        foreach (int wordcount in sectionEntries.Keys)
                        {
                            var wordsForCount = sectionEntries[wordcount];
                            foreach (string word in wordsForCount.Keys)
                            {
                                WordList.Add(word);
                                WordListUnpacked.Add(word, wordsForCount[word]);
                                WordsProcessed++;
                            }
                        }
                    }

                    //we can wipe this out now that we're done with it
                    ParsedDict = new DictionaryData();
                    #endregion

                    WordList.Sort();

                    //now we go back and iterate over everything to write it out as a table
                    foreach (string word in WordList)
                    {
                        //column 0 is the word itself; every category column starts out empty
                        string[] RowToWrite = new string[NumCats + 1];
                        for (int j = 0; j < RowToWrite.Length; j++)
                        {
                            RowToWrite[j] = "";
                        }
                        RowToWrite[0] = Quote + word.Replace(Quote, Quote + Quote) + Quote;

                        foreach (string categoryCode in WordListUnpacked[word])
                        {
                            int column;
                            if (CatIndices.TryGetValue(categoryCode, out column))
                            {
                                RowToWrite[column] = "X";
                            }
                        }

                        OutputWriter.WriteString(String.Join(Delimiter, RowToWrite));
                    }
                    #endregion
                }
            }
        }
        catch
        {
            //any failure while building or writing the CSV is reported to the user, not rethrown
            MessageBox.Show("There was a problem writing your dictionary to a CSV file. Is your CSV file currently open in another application?", "CSV Write Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
        }
    }

    return (new Payload());
}