Beispiel #1
0
        /// <summary>Read the Word2Vec word vector flat txt file.</summary>
        /// <remarks>
        /// Each line is lower-cased and split on runs of whitespace. The first token is used as the
        /// key and every remaining token is parsed as one float component of the vector. Lines with
        /// fewer than 100 tokens are skipped. Each vector is passed through
        /// <c>ArrayMath.L2normalize</c> before it is stored.
        /// </remarks>
        /// <param name="file">The word2vec text file.</param>
        /// <returns>The word vectors in the file.</returns>
        /// <exception cref="System.FormatException">Thrown if a vector component is not a valid number.</exception>
        public static VectorMap ReadWord2Vec(string file)
        {
            // Lines with fewer tokens than this are ignored.
            // NOTE(review): presumably this filters out the header line and truncated rows -- confirm.
            const int MinTokensPerLine = 100;

            VectorMap vectors = new VectorMap();
            int       dim     = -1;

            foreach (string line in IOUtils.ReadLines(file))
            {
                // string.Split does not interpret regular expressions, so splitting on "\\s+" would
                // look for that literal text. A null separator array splits on any whitespace, and
                // RemoveEmptyEntries collapses runs of whitespace (and a trailing separator).
                string[] split = line.ToLowerInvariant()
                                     .Split((char[])null, System.StringSplitOptions.RemoveEmptyEntries);
                if (split.Length < MinTokensPerLine)
                {
                    continue;
                }

                float[] vector = new float[split.Length - 1];
                if (dim == -1)
                {
                    // The first accepted line fixes the expected dimensionality.
                    dim = vector.Length;
                }
                System.Diagnostics.Debug.Assert(dim == vector.Length, "Inconsistent vector dimensionality in word2vec file");

                for (int i = 1; i < split.Length; i++)
                {
                    // Invariant culture: the file uses '.' as the decimal separator regardless of
                    // the culture of the current thread.
                    vector[i - 1] = float.Parse(split[i], System.Globalization.CultureInfo.InvariantCulture);
                }
                ArrayMath.L2normalize(vector);
                vectors[split[0]] = vector;
            }
            return(vectors);
        }
Beispiel #2
0
        /// <summary>Read word vectors from an input stream.</summary>
        /// <remarks>
        /// Read word vectors from an input stream. The stream is not closed on finishing the function.
        /// The data is read in this order: an int selecting the key-length integer type, an int giving
        /// the vector dimensionality, an int giving the number of entries, and then for each entry the
        /// key length, the key bytes and one short per vector component.
        /// </remarks>
        /// <param name="in">The stream to read from. This is not closed.</param>
        /// <returns>The word vectors encoded on the stream.</returns>
        /// <exception cref="System.IO.IOException">
        /// Thrown if we could not read from the stream, if the stream ends before a key has been
        /// fully read, or if the header or a key length is negative.
        /// </exception>
        public static VectorMap Deserialize(InputStream @in)
        {
            DataInputStream dataIn = new DataInputStream(@in);

            // Read the integer type that is used to decode each key's length
            VectorMap.Itype keyIntType = VectorMap.Itype.GetType(dataIn.ReadInt());
            // Read the vector dimensionality
            int dim = dataIn.ReadInt();
            if (dim < 0)
            {
                throw new IOException("Invalid vector dimensionality: " + dim);
            }
            // Read the size of the dataset
            int size = dataIn.ReadInt();
            // Read the vectors
            VectorMap vectors = new VectorMap();

            for (int i = 0; i < size; ++i)
            {
                // Read the key
                int strlen = keyIntType.Read(dataIn);
                if (strlen < 0)
                {
                    throw new IOException("Invalid key length: " + strlen);
                }
                byte[] buffer = new byte[strlen];
                // A single Read call may return fewer bytes than requested even when more data is
                // still to come, so keep reading until the buffer is full or the stream ends.
                int filled = 0;
                while (filled < strlen)
                {
                    int count = dataIn.Read(buffer, filled, strlen - filled);
                    if (count <= 0)
                    {
                        throw new IOException("Could not read string buffer fully!");
                    }
                    filled += count;
                }
                string key = Sharpen.Runtime.GetStringForBytes(buffer);
                // Read the vector: one short per component, converted by ToFloat
                // NOTE(review): presumably a 16-bit float encoding -- confirm against ToFloat.
                float[] vector = new float[dim];
                for (int k = 0; k < vector.Length; ++k)
                {
                    vector[k] = ToFloat(dataIn.ReadShort());
                }
                // Add the key/value
                vectors[key] = vector;
            }
            return(vectors);
        }