예제 #1
0
        /// <summary>
        /// Packs the current settings of this object into a string-to-string dictionary.
        /// </summary>
        /// <param name="suppressWarnings">Not read by this method.</param>
        /// <returns>
        /// A new dictionary containing the selected encoding, the incoming text and output
        /// locations, the vocabulary size, vector size and cosine cutoff (as strings), the
        /// word-list length, and one "WordList" + index entry for every word-list element.
        /// </returns>
        public Dictionary <string, string> ExportSettings(bool suppressWarnings)
        {
            // Collection-initializer entries are added through Dictionary.Add, in the order written.
            var exported = new Dictionary <string, string>
            {
                { "SelectedEncoding", SelectedEncoding },
                { "IncomingTextLocation", IncomingTextLocation },
                { "OutputLocation", OutputLocation },
                { "VocabSize", VocabSize.ToString() },
                { "VectorSize", VectorSize.ToString() },
                { "CosineCutoff", CosineCutoff.ToString() }
            };

            // A missing word list is exported as a list of length zero.
            int wordCount = (WordList == null) ? 0 : WordList.Length;
            exported.Add("WordListLength", wordCount.ToString());

            // Each word gets its own key: WordList0, WordList1, ...
            int index = 0;
            while (index < wordCount)
            {
                exported.Add("WordList" + index.ToString(), WordList[index]);
                index++;
            }

            return exported;
        }
예제 #2
0
        /// <summary>
        /// Packs the current settings of this object into a string-to-string dictionary.
        /// </summary>
        /// <param name="suppressWarnings">Not read by this method.</param>
        /// <returns>
        /// A new dictionary containing the input model filename, the selected encoding,
        /// and the vocabulary and vector sizes converted to strings.
        /// </returns>
        public Dictionary <string, string> ExportSettings(bool suppressWarnings)
        {
            // Collection-initializer entries are added through Dictionary.Add, in the order written.
            var exported = new Dictionary <string, string>
            {
                { "InputModelFilename", InputModelFilename },
                { "SelectedEncoding", SelectedEncoding },
                { "VocabSize", VocabSize.ToString() },
                { "VectorSize", VectorSize.ToString() }
            };

            return exported;
        }
예제 #3
0
        /// <summary>
        /// Lets the user pick a word embedding model file, reads its first line to determine
        /// the vocabulary and vector sizes, and shows the result on the form.
        /// </summary>
        /// <remarks>
        /// A first line with exactly two tokens is treated as a "vocab-size vector-size" header.
        /// Any other first line is treated as a data row (a word followed by its vector), so the
        /// vocabulary size is recorded as -1 (unknown) and the vector size is the token count
        /// minus one. VocabSize, VectorSize and the textboxes are only updated once the first
        /// line has been read, parsed and validated; on any failure an error dialog is shown
        /// and the previous values are left untouched.
        /// </remarks>
        /// <param name="sender">Event source; not used.</param>
        /// <param name="e">Event data; not used.</param>
        private void SetFolderButton_Click(object sender, System.EventArgs e)
        {
            using (var dialog = new OpenFileDialog())
            {
                dialog.Multiselect     = false;
                dialog.CheckFileExists = true;
                dialog.CheckPathExists = true;
                dialog.ValidateNames   = true;
                dialog.Title           = "Please choose the model file that you would like to read";
                dialog.FileName        = "Model.txt";
                dialog.Filter          = "Word Embedding Model (.txt,.vec)|*.txt;*.vec";

                if (dialog.ShowDialog() != DialogResult.OK)
                {
                    return;
                }

                try
                {
                    // Parse into locals first so a failure half-way through cannot leave
                    // VocabSize / VectorSize / the textboxes in a mixed old-and-new state.
                    int    parsedVocabSize;
                    int    parsedVectorSize;
                    string details;

                    using (var stream = File.OpenRead(dialog.FileName))
                        using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(EncodingDropdown.SelectedItem.ToString())))
                        {
                            // ReadLine returns null for an empty file.
                            string rawFirstLine = reader.ReadLine();
                            if (rawFirstLine == null)
                            {
                                throw new FormatException("The model file is empty.");
                            }

                            string[] firstLine = rawFirstLine.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);

                            if (firstLine.Length == 2)
                            {
                                parsedVocabSize  = int.Parse(firstLine[0]);
                                parsedVectorSize = int.Parse(firstLine[1]);
                                details          = "Vocab size: " + firstLine[0] + "; Vector Size: " + firstLine[1];

                                // -1 is reserved to mean "unknown", so a header may not use negative sizes.
                                if (parsedVocabSize < 0)
                                {
                                    throw new FormatException("The vocabulary size in the header is negative.");
                                }
                            }
                            else
                            {
                                parsedVectorSize = firstLine.Length - 1;
                                parsedVocabSize  = -1;
                                details          = "Vocab size: unknown; Vector Size: " + parsedVectorSize.ToString();
                            }

                            // A blank or single-token first line would yield a vector size of -1 or 0.
                            if (parsedVectorSize < 1)
                            {
                                throw new FormatException("The vector size could not be determined from the first line.");
                            }
                        }

                    // Everything checked out: commit the new state in one go.
                    VocabSize  = parsedVocabSize;
                    VectorSize = parsedVectorSize;
                    ModelDetailsTextbox.Text = details;
                    SelectedFileTextbox.Text = dialog.FileName;
                }
                catch
                {
                    // Deliberately broad: any read/parse/validation problem is reported the same way.
                    MessageBox.Show("There was an error while trying to read your word embedding model. It is possible that your file is not correctly formatted, or that your model file is open in another program.", "Error reading model", MessageBoxButtons.OK, MessageBoxIcon.Error);
                }
            }
        }
예제 #4
0
        /// <summary>
        /// Loads the word embedding model stored in InputModelFilename into the model array
        /// and builds WordToArrayMap, which maps each word to the row index of its first
        /// occurrence. Finishes by creating a new TwitterAwareTokenizer.
        /// </summary>
        /// <remarks>
        /// When VocabSize is known (not -1) the first line is skipped as a header and the model
        /// is filled in a single pass. When VocabSize is -1 the file is read twice: once to count
        /// rows and collect words, and again to fill an array of exactly that many rows.
        /// Blank lines are ignored in both passes. Numbers are parsed with the invariant culture,
        /// so "." is always the decimal separator regardless of the machine's regional settings.
        /// An OutOfMemoryException is reported in a message box instead of being propagated;
        /// other exceptions are not caught here.
        /// </remarks>
        public void Initialize()
        {
            TotalNumRows = 0;

            //we could use a List<double[]> to load in the word vectors, then
            //just .ToArray() it to make jagged arrays. However, I *really* want to avoid
            //having to hold the model in memory twice
            WordToArrayMap = new Dictionary <string, int>();

            bool vocabSizeKnown = (VocabSize != -1);
            if (vocabSizeKnown)
            {
                model = new double[VocabSize][];
            }

            try
            {
                #region capture dictionary words and initialize model, if vocabsize is known
                //now, during initialization, we actually go through and want to establish the word group vectors
                using (var stream = File.OpenRead(InputModelFilename))
                    using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(SelectedEncoding)))
                    {
                        if (vocabSizeKnown)
                        {
                            //the first line is the "vocab-size vector-size" header; skip it
                            reader.ReadLine();
                        }

                        while (!reader.EndOfStream)
                        {
                            string   rowWord;
                            double[] rowVector;
                            if (!TryParseModelRow(reader.ReadLine(), out rowWord, out rowVector))
                            {
                                continue;
                            }

                            //only the first occurrence of a word is mapped
                            if (!WordToArrayMap.ContainsKey(rowWord))
                            {
                                WordToArrayMap.Add(rowWord, TotalNumRows);
                                if (vocabSizeKnown)
                                {
                                    model[TotalNumRows] = rowVector;
                                }
                            }

                            TotalNumRows++;
                        }
                    }
                #endregion

                //if we didn't know the vocab size initially, we know it now that we've walked the whole model
                #region if vocab size was unknown, now we load up the whole model into memory
                if (!vocabSizeKnown)
                {
                    model        = new double[TotalNumRows][];
                    TotalNumRows = 0;

                    using (var stream = File.OpenRead(InputModelFilename))
                        using (var reader = new StreamReader(stream, encoding: Encoding.GetEncoding(SelectedEncoding)))
                        {
                            while (!reader.EndOfStream)
                            {
                                string   rowWord;
                                double[] rowVector;
                                //same skip rule as the first pass, so row indices line up with WordToArrayMap
                                if (!TryParseModelRow(reader.ReadLine(), out rowWord, out rowVector))
                                {
                                    continue;
                                }

                                if (WordToArrayMap.ContainsKey(rowWord))
                                {
                                    model[TotalNumRows] = rowVector;
                                }

                                TotalNumRows++;
                            }
                        }
                }
                #endregion
            }
            catch (OutOfMemoryException)
            {
                MessageBox.Show("Plugin Error: Latent Semantic Similarity. This plugin encountered an \"Out of Memory\" error while trying to load your pre-trained model. More than likely, you do not have enough RAM in your computer to hold this model in memory. Consider using a model with a smaller vocabulary or fewer dimensions.", "Plugin Error (Out of Memory)", MessageBoxButtons.OK, MessageBoxIcon.Error);
            }

            tokenizer = new TwitterAwareTokenizer();
        }

        /// <summary>
        /// Splits one space-separated model line into its word and its VectorSize-long vector.
        /// </summary>
        /// <param name="line">The raw line as returned by the reader; may be null or blank.</param>
        /// <param name="rowWord">Receives the first token of the line, or null when the line is skipped.</param>
        /// <param name="rowVector">Receives the parsed vector, or null when the line is skipped.</param>
        /// <returns>False when the line is null or contains no tokens; true otherwise.</returns>
        private bool TryParseModelRow(string line, out string rowWord, out double[] rowVector)
        {
            rowWord   = null;
            rowVector = null;

            if (line == null)
            {
                return false;
            }

            string[] tokens = line.TrimEnd().Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            if (tokens.Length == 0)
            {
                //blank line (e.g. a trailing newline at the end of the file)
                return false;
            }

            rowWord   = tokens[0].Trim();
            rowVector = new double[VectorSize];
            for (int i = 0; i < VectorSize; i++)
            {
                //invariant culture: the file's "." decimal separator must not depend on regional settings
                rowVector[i] = Double.Parse(tokens[i + 1], System.Globalization.CultureInfo.InvariantCulture);
            }

            return true;
        }