/// <summary>
/// Rebuilds <c>wordVectors</c> from the embedding described by
/// <c>op.wordVectors</c> / <c>op.numHid</c> and registers the vector stored
/// under <c>op.unkWord</c> a second time under the <c>UnknownWord</c> key.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the lookup of <c>op.unkWord</c> yields <c>null</c>.
/// </exception>
internal virtual void ReadWordVectors()
{
    Embedding source = new Embedding(op.wordVectors, op.numHid);
    wordVectors = Generics.NewTreeMap();

    // TODO: factor out unknown word vector code from DVParser
    foreach (string token in source.KeySet())
    {
        SimpleMatrix vec = source.Get(token);
        wordVectors[token] = vec;
    }

    // NOTE(review): depending on the concrete map type, this lookup may throw
    // on a missing key instead of returning null - confirm.
    SimpleMatrix fallback = wordVectors[op.unkWord];

    // The alias is stored before validation, exactly as before.
    wordVectors[UnknownWord] = fallback;
    if (fallback != null)
    {
        return;
    }

    throw new Exception("Unknown word vector not specified in the word vector file");
}
/// <summary>
/// Rebuilds <c>wordVectors</c> from the embedding described by
/// <c>op.lexOptions.wordVectorFile</c> / <c>op.lexOptions.numHid</c>.
/// </summary>
/// <remarks>
/// Steps, in order:
/// <list type="number">
/// <item>Every embedding key is passed through <c>op.wordFunction</c> (when
/// set) and stored with its vector. If <c>op.lexOptions.numHid</c> is not
/// positive it is set from the element count of the first vector seen.</item>
/// <item>For each enabled <c>op.trainOptions.unknown*Vector</c> flag, vectors
/// whose (normalized) word matches the corresponding pattern are summed.</item>
/// <item>The vector for <c>op.trainOptions.unkWord</c> (also normalized) is
/// registered under <c>UnknownWord</c>.</item>
/// <item>Each enabled category gets the average of its matched vectors, or a
/// new matrix built from the unknown-word vector when nothing matched.</item>
/// <item>When <c>op.trainOptions.useContextWords</c> is set, random
/// <c>numHid x 1</c> vectors in [-0.5, 0.5] are stored under
/// <c>StartWord</c> and <c>EndWord</c>.</item>
/// </list>
/// </remarks>
/// <exception cref="Exception">
/// Thrown when the lookup of the unknown word yields <c>null</c>.
/// </exception>
public virtual void ReadWordVectors()
{
    SimpleMatrix unknownNumberVector = null;
    SimpleMatrix unknownCapsVector = null;
    SimpleMatrix unknownChineseYearVector = null;
    SimpleMatrix unknownChineseNumberVector = null;
    SimpleMatrix unknownChinesePercentVector = null;

    wordVectors = Generics.NewTreeMap();

    int numberCount = 0;
    int capsCount = 0;
    int chineseYearCount = 0;
    int chineseNumberCount = 0;
    int chinesePercentCount = 0;

    Embedding rawWordVectors = new Embedding(op.lexOptions.wordVectorFile, op.lexOptions.numHid);
    foreach (string rawWord in rawWordVectors.KeySet())
    {
        SimpleMatrix vector = rawWordVectors.Get(rawWord);

        // A foreach iteration variable cannot be reassigned in C#, so the
        // normalized key lives in its own local.
        string word = op.wordFunction != null ? op.wordFunction.Apply(rawWord) : rawWord;
        wordVectors[word] = vector;

        if (op.lexOptions.numHid <= 0)
        {
            op.lexOptions.numHid = vector.GetNumElements();
        }

        if (op.trainOptions.unknownNumberVector
            && (NumberPattern.Matcher(word).Matches() || DgPattern.Matcher(word).Matches()))
        {
            ++numberCount;
            unknownNumberVector = AddToRunningSum(unknownNumberVector, vector);
        }

        if (op.trainOptions.unknownCapsVector && CapsPattern.Matcher(word).Matches())
        {
            ++capsCount;
            unknownCapsVector = AddToRunningSum(unknownCapsVector, vector);
        }

        if (op.trainOptions.unknownChineseYearVector && ChineseYearPattern.Matcher(word).Matches())
        {
            ++chineseYearCount;
            unknownChineseYearVector = AddToRunningSum(unknownChineseYearVector, vector);
        }

        if (op.trainOptions.unknownChineseNumberVector
            && (ChineseNumberPattern.Matcher(word).Matches() || DgPattern.Matcher(word).Matches()))
        {
            ++chineseNumberCount;
            unknownChineseNumberVector = AddToRunningSum(unknownChineseNumberVector, vector);
        }

        if (op.trainOptions.unknownChinesePercentVector && ChinesePercentPattern.Matcher(word).Matches())
        {
            ++chinesePercentCount;
            unknownChinesePercentVector = AddToRunningSum(unknownChinesePercentVector, vector);
        }
    }

    string unkWord = op.trainOptions.unkWord;
    if (op.wordFunction != null)
    {
        unkWord = op.wordFunction.Apply(unkWord);
    }

    // NOTE(review): depending on the concrete map type, this lookup may throw
    // on a missing key instead of returning null - confirm.
    SimpleMatrix unknownWordVector = wordVectors[unkWord];

    // Validate before registering the alias so a failed load does not leave a
    // null entry under UnknownWord.
    if (unknownWordVector == null)
    {
        throw new Exception("Unknown word vector not specified in the word vector file");
    }
    wordVectors[UnknownWord] = unknownWordVector;

    if (op.trainOptions.unknownNumberVector)
    {
        wordVectors[UnknownNumber] = AverageOrFallback(unknownNumberVector, numberCount, unknownWordVector);
    }

    if (op.trainOptions.unknownCapsVector)
    {
        wordVectors[UnknownCaps] = AverageOrFallback(unknownCapsVector, capsCount, unknownWordVector);
    }

    if (op.trainOptions.unknownChineseYearVector)
    {
        log.Info("Matched " + chineseYearCount + " chinese year vectors");
        wordVectors[UnknownChineseYear] = AverageOrFallback(unknownChineseYearVector, chineseYearCount, unknownWordVector);
    }

    if (op.trainOptions.unknownChineseNumberVector)
    {
        log.Info("Matched " + chineseNumberCount + " chinese number vectors");
        wordVectors[UnknownChineseNumber] = AverageOrFallback(unknownChineseNumberVector, chineseNumberCount, unknownWordVector);
    }

    if (op.trainOptions.unknownChinesePercentVector)
    {
        log.Info("Matched " + chinesePercentCount + " chinese percent vectors");
        wordVectors[UnknownChinesePercent] = AverageOrFallback(unknownChinesePercentVector, chinesePercentCount, unknownWordVector);
    }

    if (op.trainOptions.useContextWords)
    {
        // Draw order (start, then end) is kept so results for a given rand
        // state stay the same.
        SimpleMatrix start = SimpleMatrix.Random(op.lexOptions.numHid, 1, -0.5, 0.5, rand);
        SimpleMatrix end = SimpleMatrix.Random(op.lexOptions.numHid, 1, -0.5, 0.5, rand);
        wordVectors[StartWord] = start;
        wordVectors[EndWord] = end;
    }
}

/// <summary>
/// Adds <paramref name="vector"/> to a running sum. When no sum exists yet
/// (<c>null</c>), the sum starts as a new matrix built from
/// <paramref name="vector"/>.
/// </summary>
private static SimpleMatrix AddToRunningSum(SimpleMatrix sum, SimpleMatrix vector)
{
    return sum == null ? new SimpleMatrix(vector) : sum.Plus(vector);
}

/// <summary>
/// Returns <paramref name="sum"/> divided by <paramref name="count"/> when at
/// least one vector was accumulated; otherwise a new matrix built from
/// <paramref name="fallback"/>.
/// </summary>
private static SimpleMatrix AverageOrFallback(SimpleMatrix sum, int count, SimpleMatrix fallback)
{
    return count > 0 ? sum.Divide(count) : new SimpleMatrix(fallback);
}