/// <summary>Read the Word2Vec word vector flat txt file.</summary>
/// <remarks>
/// Each line is lower-cased and split on whitespace. The first token is used as the key and
/// the remaining tokens are parsed as the components of its vector. Lines with fewer than
/// 100 tokens are skipped. Every vector is passed through <c>ArrayMath.L2normalize</c>
/// before it is stored.
/// </remarks>
/// <param name="file">The word2vec text file.</param>
/// <returns>The word vectors in the file.</returns>
public static VectorMap ReadWord2Vec(string file)
{
    // The characters matched by the regex class \s. string.Split does not interpret its
    // argument as a regular expression, so the separators are listed explicitly.
    char[] whitespace = { ' ', '\t', '\n', '\v', '\f', '\r' };

    VectorMap vectors = new VectorMap();
    int dim = -1;
    foreach (string line in IOUtils.ReadLines(file))
    {
        // RemoveEmptyEntries collapses runs of whitespace, like the "\s+" pattern would.
        string[] split = line.ToLowerInvariant().Split(whitespace, System.StringSplitOptions.RemoveEmptyEntries);
        if (split.Length < 100)
        {
            // NOTE(review): presumably this filters out the header line and malformed
            // rows; it also drops any vector with fewer than 99 dimensions -- confirm.
            continue;
        }

        float[] vector = new float[split.Length - 1];
        if (dim == -1)
        {
            // The first accepted line fixes the expected dimensionality.
            dim = vector.Length;
        }
        System.Diagnostics.Debug.Assert(dim == vector.Length);

        for (int i = 1; i < split.Length; i++)
        {
            // Invariant culture: the file is machine-written and uses '.' as the decimal
            // separator regardless of the current locale.
            vector[i - 1] = float.Parse(split[i], System.Globalization.CultureInfo.InvariantCulture);
        }

        ArrayMath.L2normalize(vector);
        vectors[split[0]] = vector;
    }
    return vectors;
}
/// <summary>Read word vectors from an input stream.</summary>
/// <remarks>
/// The stream is not closed on finishing the function. The data is read in this order:
/// an int identifying the <c>VectorMap.Itype</c> used to encode key lengths, an int giving
/// the vector dimensionality, an int giving the number of entries, and then for each entry
/// the key length, the key bytes, and one short per vector component.
/// </remarks>
/// <param name="in">The stream to read from. This is not closed.</param>
/// <returns>The word vectors encoded on the stream.</returns>
/// <exception cref="System.IO.IOException">
/// Thrown if we could not read from the stream, if the stream ends in the middle of a key,
/// or if a key length is negative.
/// </exception>
public static VectorMap Deserialize(InputStream @in)
{
    DataInputStream dataIn = new DataInputStream(@in);

    // Read the integer type that key lengths are encoded with
    VectorMap.Itype keyIntType = VectorMap.Itype.GetType(dataIn.ReadInt());
    // Read the vector dimensionality
    int dim = dataIn.ReadInt();
    // Read the size of the dataset
    int size = dataIn.ReadInt();

    // Read the vectors
    VectorMap vectors = new VectorMap();
    for (int i = 0; i < size; ++i)
    {
        // Read the key
        int strlen = keyIntType.Read(dataIn);
        if (strlen < 0)
        {
            throw new IOException("Invalid key length: " + strlen);
        }
        byte[] buffer = new byte[strlen];

        // A single Read call may return fewer bytes than requested even when more data
        // is still to come, so keep reading until the buffer is full or the stream ends.
        int filled = 0;
        while (filled < strlen)
        {
            int count = dataIn.Read(buffer, filled, strlen - filled);
            if (count <= 0)
            {
                throw new IOException("Could not read string buffer fully!");
            }
            filled += count;
        }
        string key = Sharpen.Runtime.GetStringForBytes(buffer);

        // Read the vector; each component is stored as a short and expanded by ToFloat
        float[] vector = new float[dim];
        for (int k = 0; k < vector.Length; ++k)
        {
            vector[k] = ToFloat(dataIn.ReadShort());
        }

        // Add the key/value
        vectors[key] = vector;
    }
    return vectors;
}