示例#1
0
        /// <summary>
        /// Streams the word-vector text file at <c>IncomingTextLocation</c>, computes the cosine similarity
        /// between each row's vector and every vector in <c>WordGroupVectors</c>, and writes one delimited
        /// output row for every word whose absolute similarity to at least one group is greater than or
        /// equal to <c>CosineCutoff</c>.
        /// </summary>
        /// <param name="Input">Incoming payload; it is not read by this method.</param>
        /// <param name="ThreadsAvailable">Maximum degree of parallelism used while processing rows.</param>
        /// <returns>A newly constructed <c>Payload</c>.</returns>
        public Payload RunPlugin(Payload Input, int ThreadsAvailable)
        {
            using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation, Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
            {
                //write the header: a "Word" column followed by one quoted, quote-escaped column per header entry
                StringBuilder headerRow = new StringBuilder();
                headerRow.Append(Quotes + "Word" + Quotes);
                for (int i = 0; i < OutputHeaderData.Keys.Count; i++)
                {
                    headerRow.Append(Delimiter + Quotes + OutputHeaderData[i].Replace(Quotes, Quotes + Quotes) + Quotes);
                }
                OutputWriter.WriteString(headerRow.ToString());

                //lazily enumerate the input; when VocabSize is not -1 the first line is skipped
                //NOTE(review): presumably that first line is a "vocab-size vector-size" header -- confirm
                var lines = File.ReadLines(IncomingTextLocation, Encoding.GetEncoding(SelectedEncoding));
                if (VocabSize != -1)
                {
                    lines = lines.Skip(1);
                }

                //the norm of each word-group vector does not depend on the row being processed,
                //so compute it once up front instead of once per row
                int groupCount = WordGroupVectors.Keys.Count;
                double[] groupNorms = new double[groupCount];
                for (int g = 0; g < groupCount; g++)
                {
                    double sumOfSquares = 0;
                    for (int i = 0; i < VectorSize; i++)
                    {
                        sumOfSquares += WordGroupVectors[g][i] * WordGroupVectors[g][i];
                    }
                    groupNorms[g] = Math.Sqrt(sumOfSquares);
                }

                int LineNumber = 0;

                //report progress every 0.01 minutes (0.6 seconds) for as long as the loop is running
                TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);
                using (new System.Threading.Timer(
                           _ => SetUpdate(LineNumber),
                           null, reportPeriod, reportPeriod))
                {
                    Parallel.ForEach(lines,
                                     new ParallelOptions {
                        MaxDegreeOfParallelism = ThreadsAvailable
                    }, (line, state) =>
                    {
                        string trimmedLine = line.TrimEnd();

                        //a blank line carries no vector; skip it rather than failing on a missing column
                        if (trimmedLine.Length == 0)
                        {
                            Interlocked.Increment(ref LineNumber);
                            return;
                        }

                        //first field is the token, the following VectorSize fields are its vector
                        string[] fields = trimmedLine.Split(new[] { ' ' });
                        string RowWord  = fields[0].Trim();

                        //parse with the invariant culture so that '.' is always the decimal separator,
                        //regardless of the regional settings of the machine running the plugin
                        double[] RowVector  = new double[VectorSize];
                        double rowSumSquares = 0;
                        for (int i = 0; i < VectorSize; i++)
                        {
                            RowVector[i]   = Double.Parse(fields[i + 1], System.Globalization.CultureInfo.InvariantCulture);
                            rowSumSquares += RowVector[i] * RowVector[i];
                        }
                        double rowNorm = Math.Sqrt(rowSumSquares);

                        //Cosine Similarity (d1, d2) = Dot product(d1, d2) / (||d1|| * ||d2||)
                        //a zero-length vector yields NaN, which never satisfies the cutoff comparison
                        double[] cosSims = new double[groupCount];
                        bool WriteRow    = false;

                        for (int g = 0; g < groupCount; g++)
                        {
                            double dotproduct = 0;
                            for (int i = 0; i < VectorSize; i++)
                            {
                                dotproduct += WordGroupVectors[g][i] * RowVector[i];
                            }

                            cosSims[g] = dotproduct / (groupNorms[g] * rowNorm);

                            if (Math.Abs(cosSims[g]) >= CosineCutoff)
                            {
                                WriteRow = true;
                            }
                        }

                        if (WriteRow)
                        {
                            StringBuilder outputRow = new StringBuilder();
                            outputRow.Append(Quotes + RowWord.Replace(Quotes, Quotes + Quotes) + Quotes);
                            for (int i = 0; i < cosSims.Length; i++)
                            {
                                //NOTE(review): values are formatted with the current culture; on a
                                //comma-decimal locale this can clash with a comma delimiter -- confirm intent
                                outputRow.Append(Delimiter + cosSims[i].ToString());
                            }
                            OutputWriter.WriteString(outputRow.ToString());
                        }

                        Interlocked.Increment(ref LineNumber);
                    });
                }

                //end outputwriter
            }

            return(new Payload());
        }
示例#2
0
        /// <summary>
        /// Reads a dictionary laid out as a CSV file at <c>IncomingTextLocation</c> and writes it to
        /// <c>OutputLocation</c> as a "%"-delimited header block (category number, tab, category name)
        /// followed by one tab-separated line per word listing the category numbers it belongs to.
        /// When <c>CSVStyle</c> is "Poster" every column is a category whose cells are words; otherwise
        /// the first column holds the word and any non-blank cell in a later column marks membership.
        /// </summary>
        /// <param name="Input">Incoming payload; it is not read by this method.</param>
        /// <param name="ThreadsAvailable">Not used by this method.</param>
        /// <returns>A newly constructed <c>Payload</c>.</returns>
        public Payload RunPlugin(Payload Input, int ThreadsAvailable)
        {
            uint                             WordsProcessed     = 0;
            List <string>                    WordList           = new List <string>();
            Dictionary <int, string>         CategoryNumNameMap = new Dictionary <int, string>();
            Dictionary <string, List <int> > WordCategoryMap    = new Dictionary <string, List <int> >();

            //report progress every 0.01 minutes (0.6 seconds) until everything has been written
            TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);

            using (new System.Threading.Timer(
                       _ => SetUpdate(WordsProcessed),
                       null, reportPeriod, reportPeriod))
            {
                if (CSVStyle == "Poster")
                {
                    #region Poster Style CSV Reading
                    //read in all of the basic dictionary data from the CSV file
                    using (var stream = File.OpenRead(IncomingTextLocation))
                        using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(SelectedEncoding)))
                        {
                            var data = CsvParser.ParseHeadAndTail(reader, Delimiter[0], Quote[0]);

                            //every non-blank header cell becomes a category numbered by its 1-based column
                            //NOTE(review): 'header' is not declared in this method; presumably it is the
                            //parsed header row (data.Item1) or a member holding it -- confirm
                            for (int i = 0; i < header.Length; i++)
                            {
                                if (!String.IsNullOrWhiteSpace(header[i].Trim()))
                                {
                                    CategoryNumNameMap.Add(i + 1, header[i].Trim());
                                }
                            }

                            var lines = data.Item2;

                            foreach (var line in lines)
                            {
                                try
                                {
                                    for (int i = 0; i < line.Count(); i++)
                                    {
                                        //we only want to add the word if we've actually got a corresponding
                                        //header to go with the column that the word is in.
                                        int category = i + 1;
                                        if (!CategoryNumNameMap.ContainsKey(category))
                                        {
                                            continue;
                                        }

                                        string word = line[i].Trim();
                                        if (ConvertToLower)
                                        {
                                            word = word.ToLower();
                                        }
                                        if (string.IsNullOrWhiteSpace(word))
                                        {
                                            continue;
                                        }

                                        if (TryAddWordCategory(WordCategoryMap, WordList, word, category))
                                        {
                                            WordsProcessed++;
                                        }
                                    }
                                }
                                catch
                                {
                                    //best effort: a row that cannot be processed is abandoned at the point of
                                    //failure and reading continues with the next row
                                }
                            }
                        }
                    #endregion
                }
                else
                {
                    #region Table Style CSV Reading
                    //read in all of the basic dictionary data from the CSV file
                    using (var stream = File.OpenRead(IncomingTextLocation))
                        using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(SelectedEncoding)))
                        {
                            var data = CsvParser.ParseHeadAndTail(reader, Delimiter[0], Quote[0]);

                            //column 0 holds the words, so categories are the non-blank header cells from
                            //column 1 onwards, numbered by their column index
                            //NOTE(review): 'header' is not declared in this method; presumably it is the
                            //parsed header row (data.Item1) or a member holding it -- confirm
                            for (int i = 1; i < header.Length; i++)
                            {
                                if (!String.IsNullOrWhiteSpace(header[i].Trim()))
                                {
                                    CategoryNumNameMap.Add(i, header[i].Trim());
                                }
                            }

                            var lines = data.Item2;

                            foreach (var line in lines)
                            {
                                try
                                {
                                    string word = line[0].Trim();
                                    if (ConvertToLower)
                                    {
                                        word = word.ToLower();
                                    }
                                    if (string.IsNullOrWhiteSpace(word))
                                    {
                                        continue;
                                    }

                                    for (int i = 1; i < line.Count(); i++)
                                    {
                                        //we only want to add the word if we've actually got a corresponding
                                        //header to go with the column, and the cell itself is marked
                                        if (!CategoryNumNameMap.ContainsKey(i) || String.IsNullOrWhiteSpace(line[i]))
                                        {
                                            continue;
                                        }

                                        if (TryAddWordCategory(WordCategoryMap, WordList, word, i))
                                        {
                                            WordsProcessed++;
                                        }
                                    }
                                }
                                catch
                                {
                                    //best effort: a row that cannot be processed is abandoned at the point of
                                    //failure and reading continues with the next row
                                }
                            }
                        }
                    #endregion
                }

                WordList.Sort();

                using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation,
                                                                                        Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
                {
                    OutputWriter.WriteString("%");

                    //write the header. Category numbers are column positions, so a blank header cell leaves
                    //a gap in the numbering; walk the keys that actually exist (in ascending order) rather
                    //than assuming they run 1..Count, so the numbers match those written for each word below
                    foreach (KeyValuePair <int, string> category in CategoryNumNameMap.OrderBy(entry => entry.Key))
                    {
                        OutputWriter.WriteString(category.Key.ToString() + "\t" + category.Value);
                    }

                    OutputWriter.WriteString("%");

                    //write the dictionary body: word, then its category numbers in ascending order
                    foreach (string word in WordList)
                    {
                        List <int> categories = WordCategoryMap[word];
                        categories.Sort();
                        string[] categoryArray = categories.Select(x => x.ToString()).ToArray();
                        OutputWriter.WriteString(word + "\t" + String.Join("\t", categoryArray));
                    }
                }
            }

            return(new Payload());
        }

        /// <summary>
        /// Records that <paramref name="word"/> belongs to <paramref name="category"/>. A word seen for the
        /// first time is also appended to <paramref name="wordList"/>.
        /// </summary>
        /// <returns>
        /// <c>true</c> when the word/category pairing was newly recorded; <c>false</c> when it already existed.
        /// </returns>
        private static bool TryAddWordCategory(Dictionary <string, List <int> > wordCategoryMap, List <string> wordList, string word, int category)
        {
            List <int> categories;
            if (wordCategoryMap.TryGetValue(word, out categories))
            {
                if (categories.Contains(category))
                {
                    return false;
                }
                categories.Add(category);
                return true;
            }

            wordCategoryMap.Add(word, new List <int>()
            {
                category
            });
            wordList.Add(word);
            return true;
        }
示例#3
0
        /// <summary>
        /// Parses the dictionary held in <c>DictDataRawMeta</c> and writes it to <c>OutputLocation</c> as a
        /// CSV file. When <c>CSVStyle</c> is "Poster" each category is a column listing its words; otherwise
        /// each word is a row ("Entry" column first) with an "X" in the column of every category it belongs to.
        /// Failures while parsing or writing are reported to the user through a message box.
        /// </summary>
        /// <param name="Input">Incoming payload; it is not read by this method.</param>
        /// <param name="ThreadsAvailable">Not used by this method.</param>
        /// <returns>A newly constructed <c>Payload</c>, including when an error was reported.</returns>
        public Payload RunPlugin(Payload Input, int ThreadsAvailable)
        {
            DictionaryData ParsedDict = new DictionaryData();

            try
            {
                DictParser DP = new DictParser();
                ParsedDict = DP.ParseDict(DictDataRawMeta);
            }
            catch
            {
                MessageBox.Show("There was an error trying to parse your dictionary file. Please make sure that your dictionary file is correctly formatted.", "Error Parsing Dictionary", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return(new Payload());
            }
            //ParsedDict.FullDictionary structure
            //--"Wildcards"
            //----int Word Count
            //--------Words
            //----------categories[]
            //--"Standards"
            //----int Word Count
            //--------Words
            //----------categories[]
            NumCats = ParsedDict.NumCats;
            uint WordsProcessed = 0;

            bool     posterStyle        = CSVStyle == "Poster";
            string[] dictionarySections = { "Wildcards", "Standards" };

            //report progress every 0.01 minutes (0.6 seconds) until everything has been written
            TimeSpan reportPeriod = TimeSpan.FromMinutes(0.01);

            using (new System.Threading.Timer(
                       _ => SetUpdate(WordsProcessed),
                       null, reportPeriod, reportPeriod))
            {
                try
                {
                    using (ThreadsafeOutputWriter OutputWriter = new ThreadsafeOutputWriter(OutputLocation,
                                                                                            Encoding.GetEncoding(SelectedEncoding.ToString()), FileMode.Create))
                    {
                        #region set up and write the header
                        //poster style: one column per category. table style: an "Entry" column, then the categories.
                        //the cells are built in a fresh array so that ParsedDict.CatNames is left untouched
                        int      columnOffset = posterStyle ? 0 : 1;
                        string[] header       = new string[posterStyle ? ParsedDict.CatNames.Length : ParsedDict.NumCats + 1];
                        if (!posterStyle)
                        {
                            header[0] = Quote + "Entry" + Quote;
                        }
                        for (int i = columnOffset; i < header.Length; i++)
                        {
                            //double any embedded quote characters, exactly as is done for the word cells below
                            string catName = ParsedDict.CatNames[i - columnOffset] ?? "";
                            header[i] = Quote + catName.Replace(Quote, Quote + Quote) + Quote;
                        }

                        OutputWriter.WriteString(String.Join(Delimiter, header));
                        #endregion

                        #region map category codes to output columns
                        Dictionary <string, int> CatIndices = new Dictionary <string, int>();
                        for (int i = 0; i < ParsedDict.CatValues.Length; i++)
                        {
                            //we have to make sure that the category mapped key doesn't already exist in the CatIndices variable
                            //otherwise, a person can accidentally re-use the same category number (e.g., 10) for multiple categories,
                            //which will screw things up
                            string catCode = ParsedDict.CatValues[i];
                            if (CatIndices.ContainsKey(catCode))
                            {
                                MessageBox.Show("Your dictionary file appears to use the same code to refer to muliple categories (" + catCode + "). All categories that use this code will be omitted, except for the first category that used this code.", "Dictionary Formatting Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                                continue;
                            }
                            CatIndices.Add(catCode, i + columnOffset);
                        }
                        #endregion

                        if (posterStyle)
                        {
                            #region write poster style csv
                            //one word list per category column
                            List <List <string> > WordMap = new List <List <string> >();
                            for (int i = 0; i < NumCats; i++)
                            {
                                WordMap.Add(new List <string>());
                            }

                            foreach (string section in dictionarySections)
                            {
                                if (!ParsedDict.FullDictionary.ContainsKey(section))
                                {
                                    continue;
                                }

                                var sectionEntries = ParsedDict.FullDictionary[section];
                                foreach (int wordcount in sectionEntries.Keys)
                                {
                                    var wordsForCount = sectionEntries[wordcount];
                                    foreach (string word in wordsForCount.Keys)
                                    {
                                        string[] catCodes = wordsForCount[word];
                                        for (int i = 0; i < catCodes.Length; i++)
                                        {
                                            //codes that were never mapped to a column are ignored
                                            if (CatIndices.ContainsKey(catCodes[i]))
                                            {
                                                WordMap[CatIndices[catCodes[i]]].Add(Quote + word.Replace(Quote, Quote + Quote) + Quote);
                                            }
                                        }
                                        WordsProcessed++;
                                    }
                                }
                            }

                            //now that we've populated the word map, we can clean some things up
                            //first, wipe out the parseddict
                            ParsedDict = new DictionaryData();

                            //sort each column and find the longest one, which fixes the number of rows
                            int MaxWords = 0;
                            for (int i = 0; i < NumCats; i++)
                            {
                                WordMap[i].Sort();
                                if (WordMap[i].Count > MaxWords)
                                {
                                    MaxWords = WordMap[i].Count;
                                }
                            }

                            //write row by row; columns that have run out of words contribute an empty cell
                            for (int row = 0; row < MaxWords; row++)
                            {
                                string[] cells = new string[NumCats];
                                for (int col = 0; col < NumCats; col++)
                                {
                                    cells[col] = row < WordMap[col].Count ? WordMap[col][row] : "";
                                }
                                OutputWriter.WriteString(String.Join(Delimiter, cells));
                            }
                            #endregion
                        }
                        else
                        {
                            #region write table style csv
                            List <string> WordList = new List <string>();
                            Dictionary <string, string[]> WordListUnpacked = new Dictionary <string, string[]>();

                            #region deconstruct dictionary object
                            //flatten the parsed dictionary into a word list plus a word -> category-codes lookup.
                            //a section that is absent from the parsed dictionary is skipped, matching the
                            //poster-style branch, instead of failing the whole export
                            foreach (string section in dictionarySections)
                            {
                                if (!ParsedDict.FullDictionary.ContainsKey(section))
                                {
                                    continue;
                                }

                                var sectionEntries = ParsedDict.FullDictionary[section];
                                foreach (int wordcount in sectionEntries.Keys)
                                {
                                    var wordsForCount = sectionEntries[wordcount];
                                    foreach (string word in wordsForCount.Keys)
                                    {
                                        WordList.Add(word);
                                        WordListUnpacked.Add(word, wordsForCount[word]);
                                        WordsProcessed++;
                                    }
                                }
                            }

                            //we can wipe this out now that we're done with it
                            ParsedDict = new DictionaryData();
                            #endregion

                            WordList.Sort();

                            //now we go back and iterate over everything to write it out as a table
                            foreach (string word in WordList)
                            {
                                //start from a row of empty cells: the word first, then one cell per category
                                string[] RowToWrite = new string[NumCats + 1];
                                for (int j = 0; j < RowToWrite.Length; j++)
                                {
                                    RowToWrite[j] = "";
                                }

                                RowToWrite[0] = Quote + word.Replace(Quote, Quote + Quote) + Quote;

                                string[] catCodes = WordListUnpacked[word];
                                for (int j = 0; j < catCodes.Length; j++)
                                {
                                    //codes that were never mapped to a column are ignored
                                    if (CatIndices.ContainsKey(catCodes[j]))
                                    {
                                        RowToWrite[CatIndices[catCodes[j]]] = "X";
                                    }
                                }

                                OutputWriter.WriteString(String.Join(Delimiter, RowToWrite));
                            }
                            #endregion
                        }
                    }
                }
                catch
                {
                    //any failure while building or writing the output is reported to the user with this message
                    MessageBox.Show("There was a problem writing your dictionary to a CSV file. Is your CSV file currently open in another application?", "CSV Write Error", MessageBoxButtons.OK, MessageBoxIcon.Error);
                }
            }

            return(new Payload());
        }