/// <summary>
/// Converts the records produced by <c>ParseBinaryVector</c> for <paramref name="word2vecFile"/>
/// into text and writes them to <paramref name="parsedFile"/>. Each record is written as the
/// word, then every vector component preceded by one space, then a "\r" terminator.
/// </summary>
/// <remarks>
/// Everything up to and including the first "en/" in the word is removed before writing
/// (presumably a "/en/"-style key prefix; confirm against the input data). A word that does
/// not contain "en/" is written unchanged.
/// Records that throw while being read or written are skipped (best effort); the number of
/// skipped records is reported on standard error once the run finishes.
/// </remarks>
/// <param name="word2vecFile">Path passed to <c>ParseBinaryVector</c>.</param>
/// <param name="parsedFile">Output path; opened with <c>FileMode.Create</c>.</param>
static void ParseWord2Vec(string word2vecFile, string parsedFile)
{
    const string marker = "en/";

    var writer = new LargeFileWriter(parsedFile, FileMode.Create);
    try
    {
        var parser = new ParseBinaryVector(word2vecFile);
        int count = 0;
        int skipped = 0;

        while (!parser.EOF)
        {
            // Progress indicator: one console line per 1000 attempted records.
            if (++count % 1000 == 0)
            {
                Console.WriteLine(count);
            }

            try
            {
                var pair = parser.GetNextVector();

                // Ordinal search: the marker is a literal key fragment, not linguistic text.
                var index = pair.first.IndexOf(marker, StringComparison.Ordinal);

                // IndexOf returns -1 when the marker is absent; in that case keep the whole
                // word instead of blindly chopping off its first characters.
                var word = index >= 0
                    ? pair.first.Substring(index + marker.Length)
                    : pair.first;

                writer.Write(word);
                foreach (var value in pair.second)
                {
                    writer.Write(string.Format(" {0}", value));
                }
                writer.Write("\r");
            }
            catch (Exception)
            {
                // Best effort: a bad record must not abort the conversion, but count it so
                // the loss is visible. NOTE(review): this relies on the parser advancing or
                // reaching EOF after a failure; otherwise the loop cannot terminate.
                skipped++;
            }
        }

        if (skipped > 0)
        {
            Console.Error.WriteLine(string.Format("ParseWord2Vec: skipped {0} record(s) due to errors.", skipped));
        }
    }
    finally
    {
        // Always release the output file, even if the parser could not be created.
        writer.Close();
    }
}
/// <summary>
/// Filters the records produced by <c>ParseBinaryVector</c> for <paramref name="word2vecFile"/>,
/// keeping only those whose word is listed in <paramref name="interestWordFile"/>, and writes
/// the kept records to <paramref name="compressedWord2VectorFile"/> as text. Each record is
/// written as the word, then every vector component preceded by one space, then a "\r" terminator.
/// </summary>
/// <remarks>
/// The interest list is read one word per line; each line is trimmed before being stored.
/// Matching uses the default <c>HashSet&lt;string&gt;</c> comparer, i.e. exact, case-sensitive equality.
/// Records that throw while being read or written are skipped (best effort); the number of
/// skipped records is reported on standard error once the run finishes.
/// </remarks>
/// <param name="interestWordFile">Path of the text file listing the words to keep.</param>
/// <param name="word2vecFile">Path passed to <c>ParseBinaryVector</c>.</param>
/// <param name="compressedWord2VectorFile">Output path; opened with <c>FileMode.Create</c>.</param>
public static void SelectInterestWordVector(string interestWordFile, string word2vecFile, string compressedWord2VectorFile)
{
    // Load the words of interest, one per line.
    var set = new HashSet<string>();
    var reader = new LargeFileReader(interestWordFile);
    try
    {
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            set.Add(line.Trim());
        }
    }
    finally
    {
        // Release the input file even if reading fails part-way.
        reader.Close();
    }

    var writer = new LargeFileWriter(compressedWord2VectorFile, FileMode.Create);
    try
    {
        var parser = new ParseBinaryVector(word2vecFile);
        int count = 0;
        int skipped = 0;

        while (!parser.EOF)
        {
            // Progress indicator: one console line per 1000 attempted records.
            if (++count % 1000 == 0)
            {
                Console.WriteLine(count);
            }

            try
            {
                var pair = parser.GetNextVector();
                if (!set.Contains(pair.first))
                {
                    continue;
                }

                writer.Write(pair.first);
                foreach (var value in pair.second)
                {
                    writer.Write(string.Format(" {0}", value));
                }
                writer.Write("\r");
            }
            catch (Exception)
            {
                // Best effort: a bad record must not abort the run, but count it so the
                // loss is visible. NOTE(review): this relies on the parser advancing or
                // reaching EOF after a failure; otherwise the loop cannot terminate.
                skipped++;
            }
        }

        if (skipped > 0)
        {
            Console.Error.WriteLine(string.Format("SelectInterestWordVector: skipped {0} record(s) due to errors.", skipped));
        }
    }
    finally
    {
        // Always release the output file, even if the parser could not be created.
        writer.Close();
    }
}