/// <summary>
/// Initializes a new instance of the <see cref="Dedup1024KHashAlgorithm"/> class.
/// </summary>
/// <param name="chunker">Chunker whose configured average chunk size must match this hash type.</param>
public Dedup1024KHashAlgorithm(IChunker chunker)
    : base(chunker)
{
    // Fail fast if the caller supplied a chunker configured for a different
    // average chunk size than this hash type requires.
    int requiredAvgChunkSize = TargetHashType.GetAvgChunkSize();
    Contract.Check(chunker.Configuration.AvgChunkSize == requiredAvgChunkSize)?.Assert(
        $"Invalid average chunk size (in bytes) specified: {chunker.Configuration.AvgChunkSize} expected: {requiredAvgChunkSize}");
}
/// <summary>
/// Initializes the parser from its three pre-trained maxent models plus the
/// shared tagger/chunker/head-rules machinery handled by the base constructor.
/// </summary>
/// <param name="buildModel">The build model.</param>
/// <param name="attachModel">The attach model.</param>
/// <param name="checkModel">The check model.</param>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
private Parser(
    IMaxentModel buildModel,
    IMaxentModel attachModel,
    IMaxentModel checkModel,
    IPOSTagger tagger,
    IChunker chunker,
    AbstractHeadRules headRules,
    int beamSize,
    double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
    this.buildModel = buildModel;
    this.attachModel = attachModel;
    this.checkModel = checkModel;

    // Context generators build the feature contexts fed to each model.
    // punctSet is populated by the base constructor (headRules.PunctuationTags).
    buildContextGenerator = new BuildContextGenerator();
    attachContextGenerator = new AttachContextGenerator(punctSet);
    checkContextGenerator = new CheckContextGenerator(punctSet);

    // Reusable probability buffers, sized to each model's outcome count.
    bProbs = new double[buildModel.GetNumOutcomes()];
    aProbs = new double[attachModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    // Cache outcome indices up front so decoding avoids repeated name lookups.
    doneIndex = buildModel.GetIndex(DONE);
    sisterAttachIndex = attachModel.GetIndex(ATTACH_SISTER);
    daughterAttachIndex = attachModel.GetIndex(ATTACH_DAUGHTER);
    // nonAttachIndex = attachModel.GetIndex(NON_ATTACH);
    attachments = new[] { daughterAttachIndex, sisterAttachIndex };
    completeIndex = checkModel.GetIndex(COMPLETE);
}
/// <summary>
/// Trains a small chunker model from the sample stream and creates the
/// chunker instance used by the tests.
/// </summary>
public void Setup()
{
    var trainingParams = new TrainingParameters();
    trainingParams.Set(Parameters.Iterations, "70");
    trainingParams.Set(Parameters.Cutoff, "1");

    var chunkerModel = ChunkerME.Train("en", CreateSampleStream(), trainingParams, new ChunkerFactory());

    chunker = new ChunkerME(chunkerModel);
}
/// <summary>
/// Initializes a new instance of the <see cref="AbstractAnalyzer" /> with the specified weight.
/// </summary>
/// <param name="chunker">The chunker used in this analyzer.</param>
/// <param name="weight">The analyzer weight.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="chunker"/> is null.</exception>
public ChunkerAnalyzer(IChunker chunker, float weight) : base(weight)
{
    // nameof keeps the parameter name refactor-safe (was the literal "chunker").
    if (chunker == null)
    {
        throw new ArgumentNullException(nameof(chunker));
    }

    Chunker = chunker;
}
/// <summary>
/// Initializes a new instance of the <see cref="NLPToolsController"/> class,
/// loading the SharpNLP .nbin models from the given directory.
/// </summary>
/// <param name="modelPath">
/// Directory containing the .nbin model files; expected to end with a trailing
/// path separator. Defaults to the previously hard-coded location so existing
/// parameterless callers keep working.
/// </param>
public NLPToolsController(string modelPath = @"C:\Users\Garrett\Documents\Visual Studio 2015\Projects\MindysTermExtractionLibrary\src\sharpnlp-nbin-files\")
{
    // Each NLP tool loads its own pre-trained model file from the directory.
    sentenceDetector = new EnglishMaximumEntropySentenceDetector(modelPath + "EnglishSD.nbin");
    tokenizer = new EnglishMaximumEntropyTokenizer(modelPath + "EnglishTok.nbin");
    posTagger = new EnglishMaximumEntropyPosTagger(modelPath + "EnglishPOS.nbin");
    phraseChunker = new EnglishTreebankChunker(modelPath + "EnglishChunk.nbin");
}
/// <nodoc />
public DedupNodeOrChunkHashAlgorithm(IChunker chunker)
{
    // Only chunkers with a supported configuration can be used; anything else
    // is rejected up front.
    if (ChunkerConfiguration.IsValidChunkSize(chunker.Configuration))
    {
        _chunker = chunker;
        Initialize();
    }
    else
    {
        throw new NotImplementedException($"Unsupported chunk size specified: {chunker.Configuration.AvgChunkSize} in bytes.");
    }
}
/// <summary>
/// Initializes a new instance of the <see cref="AbstractAnalyzer" /> with the specified weight.
/// </summary>
/// <param name="chunker">The chunker used in this analyzer.</param>
/// <param name="weight">The analyzer weight.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="chunker"/> is null.</exception>
public ChunkerAnalyzer(IChunker chunker, float weight) : base(weight)
{
    // nameof keeps the parameter name refactor-safe (was the literal "chunker").
    if (chunker == null)
    {
        throw new ArgumentNullException(nameof(chunker));
    }

    Chunker = chunker;
}
/// <summary>
/// Creates a session that chunks pushed data with the given chunker and
/// reports each chunk through the callback.
/// </summary>
/// <param name="chunker">Chunker used to split incoming data.</param>
/// <param name="callback">Invoked once per discovered chunk.</param>
public Session(IChunker chunker, Action<ChunkInfo> callback)
{
    _chunker = chunker;
    _callback = callback;

    // Lease pooled buffers for the lifetime of this session; the handles own
    // the pooled objects and the locals cache their values.
    _pushBufferHandle = PushBufferPool.Get();
    _pushBuffer = _pushBufferHandle.Value;

    _chunksSeenHandle = ChunksSeenPool.Get();
    _chunksSeen = _chunksSeenHandle.Value;
}
/// <summary>
/// Initializes a new instance of the <see cref="AbstractBottomUpParser"/>.
/// </summary>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses to chunk non-recursive structures.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
protected AbstractBottomUpParser(IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
{
    this.tagger = tagger;
    this.chunker = chunker;

    // M and K both start at the beam size; Q holds the advance percentage.
    M = beamSize;
    K = beamSize;
    Q = advancePercentage;
    ReportFailedParse = true;

    this.headRules = headRules;
    punctSet = headRules.PunctuationTags;

    // Beam-sized heaps used while advancing parses (odh/ndh) plus one that
    // collects completed parses.
    odh = new ListHeap<Parse>(K);
    ndh = new ListHeap<Parse>(K);
    completeParses = new ListHeap<Parse>(K);
}
/// <summary>
/// Trains the sharp (C#) and java chunker models with identical training
/// parameters and creates both chunkers so the tests can compare them.
/// </summary>
public void Setup()
{
    var sharpParams = new TrainingParameters();
    sharpParams.Set(Parameters.Iterations, "70");
    sharpParams.Set(Parameters.Cutoff, "1");

    var javaParams = new opennlp.tools.util.TrainingParameters();
    javaParams.put("Iterations", "70");
    javaParams.put("Cutoff", "1");

    var sharpModel = ChunkerME.Train("en", ChunkerMETest.CreateSampleStream(), sharpParams, new ChunkerFactory());
    var javaModel = opennlp.tools.chunker.ChunkerME.train("en", JavaSampleStream(), javaParams, new opennlp.tools.chunker.ChunkerFactory());

    Assert.NotNull(sharpModel);
    Assert.NotNull(javaModel);

    sChunker = new ChunkerME(sharpModel);
    jChunker = new opennlp.tools.chunker.ChunkerME(javaModel);
}
/// <summary>
/// Hashes a deterministically filled buffer with the given chunker and asserts
/// the resulting root hash equals the expected value; returns the root node.
/// </summary>
/// <param name="chunker">Chunker under test.</param>
/// <param name="byteCount">Size of the test buffer.</param>
/// <param name="expectedHash">Expected root hash, in hex.</param>
/// <param name="seed">Seed for the deterministic test content.</param>
private DedupNode HashIsStableForChunker(IChunker chunker, uint byteCount, string expectedHash, int seed)
{
    using (var hasher = new DedupNodeHashAlgorithm(chunker))
    {
        byte[] bytes = new byte[byteCount];

        if (byteCount > 0)
        {
            // Deterministic content keeps the expected hash stable across runs.
            FillBufferWithTestContent(seed, bytes);
        }

        hasher.Initialize();
        hasher.ComputeHash(bytes, 0, bytes.Length);
        var node = hasher.GetNode();

        // The chunk leaves must account for every input byte exactly once.
        Assert.Equal<long>((long)byteCount, node.EnumerateChunkLeafsInOrder().Sum(c => (long)c.TransitiveContentBytes));

        // Prefix both sides with context so a failure message identifies the case.
        string header = $"Chunker:{chunker.GetType().Name} Seed:{seed} Length:{byteCount} Hash:";
        Assert.Equal<string>($"{header}{expectedHash}", $"{header}{node.Hash.ToHex()}");
        return (node);
    }
}
/// <summary>
/// Initializes the parser from its pre-trained build and check models plus the
/// shared tagger/chunker/head-rules machinery handled by the base constructor.
/// </summary>
/// <param name="buildModel">The build model.</param>
/// <param name="checkModel">The check model.</param>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
private Parser(IMaxentModel buildModel, IMaxentModel checkModel, IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
    this.buildModel = buildModel;
    this.checkModel = checkModel;

    // Reusable probability buffers, sized to each model's outcome count.
    bProbs = new double[buildModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    buildContextGenerator = new BuildContextGenerator();
    checkContextGenerator = new CheckContextGenerator();

    // Map every START/CONT outcome name to its type portion (the text after
    // the prefix), so decoding can strip the prefix with a dictionary lookup.
    startTypeMap = new Dictionary<string, string>();
    contTypeMap = new Dictionary<string, string>();
    for (int boi = 0, bon = buildModel.GetNumOutcomes(); boi < bon; boi++)
    {
        var outcome = buildModel.GetOutcome(boi);
        if (outcome.StartsWith(START))
        {
            startTypeMap[outcome] = outcome.Substring(START.Length);
        }
        else if (outcome.StartsWith(CONT))
        {
            contTypeMap[outcome] = outcome.Substring(CONT.Length);
        }
    }

    // Cache frequently used outcome indices to avoid repeated name lookups.
    topStartIndex = buildModel.GetIndex(TOP_START);
    completeIndex = checkModel.GetIndex(COMPLETE);
    incompleteIndex = checkModel.GetIndex(INCOMPLETE);
}
/// <summary>
/// Verifies that every chunk leaf recorded by the node hasher has the same
/// hash the standalone chunk hash algorithm produces for that byte range, and
/// that the leaves tile the entire input buffer.
/// </summary>
/// <param name="chunker">Chunker under test.</param>
private void HashOfChunksInNodeMatchesChunkHashAlgorithmInner(IChunker chunker)
{
    using (var nodeHasher = new DedupNodeHashAlgorithm(chunker))
    using (var chunkHasher = new DedupChunkHashAlgorithm())
    {
        // Enough data (at ~64 KB average chunks) to force a two-level node tree.
        byte[] bytes = new byte[2 * DedupNode.MaxDirectChildrenPerNode * (64 * 1024 /* avg chunk size */)];
        var r = new Random(Seed: 0); // fixed seed keeps the test deterministic
        r.NextBytes(bytes);

        nodeHasher.ComputeHash(bytes, 0, bytes.Length);
        var node = nodeHasher.GetNode();
        Assert.NotNull(node.Height);
        Assert.Equal((uint)2, node.Height.Value);

        // Walk the leaves in order: each leaf's hash must equal the hash of its
        // corresponding byte range, ranges laid out back-to-back.
        ulong offset = 0;
        foreach (var chunkInNode in node.EnumerateChunkLeafsInOrder())
        {
            byte[] chunkHash = chunkHasher.ComputeHash(bytes, (int)offset, (int)chunkInNode.TransitiveContentBytes);
            Assert.Equal(chunkHash.ToHex(), chunkInNode.Hash.ToHex());
            offset += chunkInNode.TransitiveContentBytes;
        }

        // The leaves must cover the full input.
        Assert.Equal(offset, node.TransitiveContentBytes);
    }
}
/// <summary>
/// Initializes the parser from its pre-trained build and check models plus the
/// shared tagger/chunker/head-rules machinery handled by the base constructor.
/// </summary>
/// <param name="buildModel">The build model.</param>
/// <param name="checkModel">The check model.</param>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
private Parser(IMaxentModel buildModel, IMaxentModel checkModel, IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
    this.buildModel = buildModel;
    this.checkModel = checkModel;

    // Reusable probability buffers, sized to each model's outcome count.
    bProbs = new double[buildModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    buildContextGenerator = new BuildContextGenerator();
    checkContextGenerator = new CheckContextGenerator();

    // Map every START/CONT outcome name to its type portion (the text after
    // the prefix), so decoding can strip the prefix with a dictionary lookup.
    startTypeMap = new Dictionary<string, string>();
    contTypeMap = new Dictionary<string, string>();
    for (int boi = 0, bon = buildModel.GetNumOutcomes(); boi < bon; boi++)
    {
        var outcome = buildModel.GetOutcome(boi);
        if (outcome.StartsWith(START))
        {
            startTypeMap[outcome] = outcome.Substring(START.Length);
        }
        else if (outcome.StartsWith(CONT))
        {
            contTypeMap[outcome] = outcome.Substring(CONT.Length);
        }
    }

    // Cache frequently used outcome indices to avoid repeated name lookups.
    topStartIndex = buildModel.GetIndex(TOP_START);
    completeIndex = checkModel.GetIndex(COMPLETE);
    incompleteIndex = checkModel.GetIndex(INCOMPLETE);
}
/// <summary>
/// Initializes a new instance of the <see cref="ChunkerAnalyzer"/> class.
/// </summary>
/// <param name="chunker">The chunker.</param>
/// <remarks>Delegates to the weighted overload using a default weight of 5.</remarks>
public ChunkerAnalyzer(IChunker chunker) : this(chunker, 5f)
{
}
/// <summary>
/// Initializes a new instance of the <see cref="DeterministicChunker"/> class,
/// wrapping the supplied inner chunker.
/// </summary>
/// <param name="chunker">The underlying chunker to delegate to.</param>
/// <exception cref="ArgumentNullException"><paramref name="chunker"/> is null.</exception>
public DeterministicChunker(IChunker chunker)
{
    // Fail fast on a null chunker instead of deferring a
    // NullReferenceException to the first use of _chunker.
    _chunker = chunker ?? throw new ArgumentNullException(nameof(chunker));
}
/// <summary>
/// Initializes a new instance of the <see cref="AbstractBottomUpParser"/>.
/// </summary>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses to chunk non-recursive structures.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
protected AbstractBottomUpParser(IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
{
    this.tagger = tagger;
    this.chunker = chunker;

    // M and K both start at the beam size; Q holds the advance percentage.
    M = beamSize;
    K = beamSize;
    Q = advancePercentage;
    ReportFailedParse = true;

    this.headRules = headRules;
    punctSet = headRules.PunctuationTags;

    // Beam-sized heaps used while advancing parses (odh/ndh) plus one that
    // collects completed parses.
    odh = new ListHeap<Parse>(K);
    ndh = new ListHeap<Parse>(K);
    completeParses = new ListHeap<Parse>(K);
}
/// <summary>
/// Computes the diff blocks between two texts after splitting each into pieces
/// with the supplied chunker.
/// </summary>
/// <param name="oldText">The original text.</param>
/// <param name="newText">The updated text.</param>
/// <param name="ignoreWhiteSpace">Whether pieces are compared after trimming whitespace.</param>
/// <param name="ignoreCase">Whether pieces are compared case-insensitively.</param>
/// <param name="chunker">Splits each text into comparable pieces.</param>
/// <returns>The pieces of both texts plus the list of diff blocks.</returns>
/// <exception cref="ArgumentNullException">Any argument is null.</exception>
public DiffResult CreateDiffs(string oldText, string newText, bool ignoreWhiteSpace, bool ignoreCase, IChunker chunker)
{
    if (oldText == null)
    {
        throw new ArgumentNullException(nameof(oldText));
    }

    if (newText == null)
    {
        throw new ArgumentNullException(nameof(newText));
    }

    if (chunker == null)
    {
        throw new ArgumentNullException(nameof(chunker));
    }

    var pieceHash = new Dictionary<string, int>();
    var lineDiffs = new List<DiffBlock>();

    var modOld = new ModificationData(oldText);
    var modNew = new ModificationData(newText);

    // Chunk both texts and assign each distinct piece a shared integer id
    // (respecting the whitespace/case options), then mark modified pieces.
    BuildPieceHashes(pieceHash, modOld, ignoreWhiteSpace, ignoreCase, chunker);
    BuildPieceHashes(pieceHash, modNew, ignoreWhiteSpace, ignoreCase, chunker);
    BuildModificationData(modOld, modNew);

    int piecesALength = modOld.HashedPieces.Length;
    int piecesBLength = modNew.HashedPieces.Length;
    int posA = 0;
    int posB = 0;

    do
    {
        // Advance both cursors together across the unmodified common run.
        while (posA < piecesALength && posB < piecesBLength && !modOld.Modifications[posA] && !modNew.Modifications[posB])
        {
            posA++;
            posB++;
        }

        // Consume the contiguous modified run on each side independently.
        int beginA = posA;
        int beginB = posB;
        for (; posA < piecesALength && modOld.Modifications[posA]; posA++)
        {
            ;
        }

        for (; posB < piecesBLength && modNew.Modifications[posB]; posB++)
        {
            ;
        }

        // A non-empty run on either side becomes one delete/insert diff block.
        int deleteCount = posA - beginA;
        int insertCount = posB - beginB;
        if (deleteCount > 0 || insertCount > 0)
        {
            lineDiffs.Add(new DiffBlock(beginA, deleteCount, beginB, insertCount));
        }
    } while (posA < piecesALength && posB < piecesBLength);

    return (new DiffResult(modOld.Pieces, modNew.Pieces, lineDiffs));
}
/// <summary>
/// Initializes a new instance of the <see cref="DedupNodeOrChunkHashAlgorithm"/>
/// class with the given tree algorithm and chunker.
/// </summary>
/// <param name="treeAlgorithm">Algorithm used to build the node tree.</param>
/// <param name="chunker">Chunker used to split the input.</param>
public DedupNodeOrChunkHashAlgorithm(DedupNodeTree.Algorithm treeAlgorithm, IChunker chunker)
{
    _chunker = chunker;
    _treeAlgorithm = treeAlgorithm;

    // Put the algorithm into its initial hashing state.
    Initialize();
}
/// <summary>
/// Initializes a new instance of the <see cref="DefaultTextMatcher"/> class.
/// </summary>
/// <param name="differ">The IDiffer implementation to use for determining which text replacements correspond with original text in ambiguous cases.</param>
/// <param name="chunker">The IChunker to be used with the differ.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public DefaultTextMatcher(IDiffer differ, IChunker chunker)
{
    // Validate in the same order as before: chunker first, then differ.
    if (chunker == null)
    {
        throw new ArgumentNullException(nameof(chunker));
    }

    if (differ == null)
    {
        throw new ArgumentNullException(nameof(differ));
    }

    _chunker = chunker;
    _differ = differ;
}
/// <summary>
/// Initializes a new instance of the <see cref="AbstractBottomUpParser"/>.
/// </summary>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses to chunk non-recursive structures.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
/// <remarks>All initialization is delegated to the base constructor.</remarks>
public Parser(IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
}
/// <summary>
/// Initializes a new instance of the <see cref="DedupNodeHashAlgorithm"/> class.
/// </summary>
/// <remarks>Delegates to the base class with the given tree algorithm and chunker.</remarks>
public DedupNodeHashAlgorithm(DedupNodeTree.Algorithm treeAlgorithm, IChunker chunker)
    : base(treeAlgorithm, chunker)
{
}
/// <summary>
/// Initializes a new instance of the <see cref="DedupNodeHashAlgorithm"/> class.
/// </summary>
/// <remarks>Uses the MaximallyPacked tree algorithm with the supplied chunker.</remarks>
public DedupNodeHashAlgorithm(IChunker chunker)
    : this(DedupNodeTree.Algorithm.MaximallyPacked, chunker)
{
}
/// <summary>
/// Builds an inline diff model for the two texts using the supplied chunker.
/// </summary>
/// <param name="oldText">The original text.</param>
/// <param name="newText">The updated text.</param>
/// <param name="ignoreWhitespace">Whether whitespace differences are ignored.</param>
/// <param name="ignoreCase">Whether casing differences are ignored.</param>
/// <param name="chunker">Splits the texts into comparable pieces.</param>
/// <returns>The populated diff pane model.</returns>
/// <exception cref="ArgumentNullException">Either text is null.</exception>
public DiffPaneModel BuildDiffModel(string oldText, string newText, bool ignoreWhitespace, bool ignoreCase, IChunker chunker)
{
    if (oldText == null)
    {
        throw new ArgumentNullException(nameof(oldText));
    }

    if (newText == null)
    {
        throw new ArgumentNullException(nameof(newText));
    }

    var diffResult = differ.CreateDiffs(oldText, newText, ignoreWhitespace, ignoreCase: ignoreCase, chunker);

    var model = new DiffPaneModel();
    BuildDiffPieces(diffResult, model.Lines);
    return model;
}
/// <summary>
/// Gets the inline textual diffs using the shared default differ.
/// </summary>
/// <param name="oldText">The old text to diff.</param>
/// <param name="newText">The new text.</param>
/// <param name="ignoreWhiteSpace">true to ignore white space; otherwise, false.</param>
/// <param name="ignoreCase">true if case-insensitive; otherwise, false.</param>
/// <param name="chunker">The chunker, or null for the default.</param>
/// <returns>The diffs result.</returns>
public static DiffPaneModel Diff(string oldText, string newText, bool ignoreWhiteSpace = true, bool ignoreCase = false, IChunker chunker = null)
    => Diff(Differ.Instance, oldText, newText, ignoreWhiteSpace, ignoreCase, chunker);
/// <summary>
/// Initializes a new instance of the <see cref="AbstractBottomUpParser"/>.
/// </summary>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses to chunk non-recursive structures.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
/// <remarks>All initialization is delegated to the base constructor.</remarks>
public Parser(IPOSTagger tagger, IChunker chunker, AbstractHeadRules headRules, int beamSize, double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
}
/// <summary>
/// Chunks the text in <paramref name="data"/> and assigns every piece an
/// integer id shared across both sides of the diff via <paramref name="pieceHash"/>,
/// so pieces can later be compared by id instead of by string.
/// </summary>
/// <param name="pieceHash">Shared map from normalized piece text to its id.</param>
/// <param name="data">Holds the raw text; receives the pieces, ids, and modification flags.</param>
/// <param name="ignoreWhitespace">Trim each piece before hashing.</param>
/// <param name="ignoreCase">Uppercase each piece (invariant) before hashing.</param>
/// <param name="chunker">Splits the raw text into pieces.</param>
private static void BuildPieceHashes(IDictionary<string, int> pieceHash, ModificationData data, bool ignoreWhitespace, bool ignoreCase, IChunker chunker)
{
    var pieces = string.IsNullOrEmpty(data.RawData)
        ? emptyStringArray
        : chunker.Chunk(data.RawData);

    data.Pieces = pieces;
    data.HashedPieces = new int[pieces.Length];
    data.Modifications = new bool[pieces.Length];

    for (int i = 0; i < pieces.Length; i++)
    {
        // Normalize per the comparison options before looking up the id.
        string piece = pieces[i];
        if (ignoreWhitespace)
        {
            piece = piece.Trim();
        }

        if (ignoreCase)
        {
            piece = piece.ToUpperInvariant();
        }

        // Single TryGetValue lookup instead of ContainsKey + indexer (which
        // hashed the key twice). New pieces get the next sequential id.
        if (pieceHash.TryGetValue(piece, out int existingId))
        {
            data.HashedPieces[i] = existingId;
        }
        else
        {
            data.HashedPieces[i] = pieceHash.Count;
            pieceHash[piece] = pieceHash.Count;
        }
    }
}
/// <summary>
/// Hashes a buffer sized for <paramref name="expectedChunkCount"/> average-size
/// chunks, then verifies every chunk leaf's hash against the standalone chunk
/// hasher, that the leaves tile the whole input, and that the realized chunk
/// count is within 30% of the expectation.
/// </summary>
/// <param name="expectedChunkCount">Expected number of chunks for the buffer size.</param>
/// <param name="config">Chunker configuration supplying the average chunk size.</param>
/// <param name="chunker">Chunker under test.</param>
private void HashOfChunksInNodeMatchesChunkHashAlgorithmInner(int expectedChunkCount, ChunkerConfiguration config, IChunker chunker)
{
    using (DedupNodeOrChunkHashAlgorithm nodeHasher = new DedupNodeOrChunkHashAlgorithm(chunker))
    using (DedupChunkHashAlgorithm chunkHasher = new DedupChunkHashAlgorithm())
    {
        byte[] bytes = new byte[expectedChunkCount * config.AvgChunkSize];
        nodeHasher.SetInputLength(bytes.Length);

        var r = new Random(Seed: 0); // fixed seed keeps the test deterministic
        r.NextBytes(bytes);

        nodeHasher.ComputeHash(bytes, 0, bytes.Length);
        var node = nodeHasher.GetNode();
        Assert.NotNull(node.Height);

        // Two full levels' worth of chunks must yield a tree of height 2.
        if (expectedChunkCount >= 2 * DedupNode.MaxDirectChildrenPerNode)
        {
            Assert.Equal((uint)2, node.Height.Value);
        }

        // Walk leaves in order: each leaf's hash must equal the hash of its
        // byte range, ranges laid out back-to-back across the buffer.
        ulong offset = 0;
        int chunkCount = 0;
        foreach (var chunkInNode in node.EnumerateChunkLeafsInOrder())
        {
            byte[] chunkHash = chunkHasher.ComputeHash(bytes, (int)offset, (int)chunkInNode.TransitiveContentBytes);
            Assert.Equal(chunkHash.ToHex(), chunkInNode.Hash.ToHex());
            offset += chunkInNode.TransitiveContentBytes;
            chunkCount += 1;
        }

        Assert.Equal(offset, node.TransitiveContentBytes);

        // Chunk boundaries are content-dependent, so only require the realized
        // count to be near the expected count.
        double ratio = (1.0 * expectedChunkCount) / chunkCount;
        Assert.True(Math.Abs(ratio - 1.0) < 0.3); // within 30% of expected
    }
}
/// <summary>
/// Gets the inline textual diffs.
/// </summary>
/// <param name="differ">The differ instance, or null for the shared default.</param>
/// <param name="oldText">The old text to diff.</param>
/// <param name="newText">The new text.</param>
/// <param name="ignoreWhiteSpace">true to ignore white space; otherwise, false.</param>
/// <param name="ignoreCase">true if case-insensitive; otherwise, false.</param>
/// <param name="chunker">The chunker, or null for the default line chunker.</param>
/// <returns>The diffs result.</returns>
/// <exception cref="ArgumentNullException">Either text is null.</exception>
public static DiffPaneModel Diff(IDiffer differ, string oldText, string newText, bool ignoreWhiteSpace = true, bool ignoreCase = false, IChunker chunker = null)
{
    if (oldText == null)
    {
        throw new ArgumentNullException(nameof(oldText));
    }

    if (newText == null)
    {
        throw new ArgumentNullException(nameof(newText));
    }

    // Fall back to the shared differ and line chunker when none were supplied.
    var effectiveDiffer = differ ?? Differ.Instance;
    var effectiveChunker = chunker ?? LineChunker.Instance;

    var diffResult = effectiveDiffer.CreateDiffs(oldText, newText, ignoreWhiteSpace, ignoreCase, effectiveChunker);

    var model = new DiffPaneModel();
    BuildDiffPieces(diffResult, model.Lines);
    return model;
}
/// <summary>
/// Initializes the parser from its three pre-trained maxent models plus the
/// shared tagger/chunker/head-rules machinery handled by the base constructor.
/// </summary>
/// <param name="buildModel">The build model.</param>
/// <param name="attachModel">The attach model.</param>
/// <param name="checkModel">The check model.</param>
/// <param name="tagger">The pos-tagger that the parser uses.</param>
/// <param name="chunker">The chunker that the parser uses.</param>
/// <param name="headRules">The head rules for the parser.</param>
/// <param name="beamSize">Size of the beam.</param>
/// <param name="advancePercentage">The advance percentage.</param>
private Parser(
    IMaxentModel buildModel,
    IMaxentModel attachModel,
    IMaxentModel checkModel,
    IPOSTagger tagger,
    IChunker chunker,
    AbstractHeadRules headRules,
    int beamSize,
    double advancePercentage)
    : base(tagger, chunker, headRules, beamSize, advancePercentage)
{
    this.buildModel = buildModel;
    this.attachModel = attachModel;
    this.checkModel = checkModel;

    // Context generators build the feature contexts fed to each model.
    // punctSet is populated by the base constructor (headRules.PunctuationTags).
    buildContextGenerator = new BuildContextGenerator();
    attachContextGenerator = new AttachContextGenerator(punctSet);
    checkContextGenerator = new CheckContextGenerator(punctSet);

    // Reusable probability buffers, sized to each model's outcome count.
    bProbs = new double[buildModel.GetNumOutcomes()];
    aProbs = new double[attachModel.GetNumOutcomes()];
    cProbs = new double[checkModel.GetNumOutcomes()];

    // Cache outcome indices up front so decoding avoids repeated name lookups.
    doneIndex = buildModel.GetIndex(DONE);
    sisterAttachIndex = attachModel.GetIndex(ATTACH_SISTER);
    daughterAttachIndex = attachModel.GetIndex(ATTACH_DAUGHTER);
    // nonAttachIndex = attachModel.GetIndex(NON_ATTACH);
    attachments = new[] {daughterAttachIndex, sisterAttachIndex};
    completeIndex = checkModel.GetIndex(COMPLETE);
}
/// <summary>
/// Creates a side-by-side diff builder from a differ and the chunkers used for
/// the line-level and word-level passes.
/// </summary>
/// <param name="differ">The differ, or null to use the shared default.</param>
/// <param name="lineChunker">Required chunker for the line-level pass.</param>
/// <param name="wordChunker">Required chunker for the word-level pass.</param>
/// <exception cref="ArgumentNullException">A chunker is null.</exception>
public SideBySideDiffBuilder(IDiffer differ, IChunker lineChunker, IChunker wordChunker)
{
    // The chunkers are mandatory; only the differ has a default.
    if (lineChunker == null)
    {
        throw new ArgumentNullException(nameof(lineChunker));
    }

    if (wordChunker == null)
    {
        throw new ArgumentNullException(nameof(wordChunker));
    }

    this.differ = differ ?? Differ.Instance;
    this.lineChunker = lineChunker;
    this.wordChunker = wordChunker;
}
/// <summary>
/// Initializes a new instance of the <see cref="DedupNodeOrChunkHashAlgorithm"/>
/// class with the given tree algorithm and the default chunker.
/// </summary>
/// <param name="treeAlgorithm">Algorithm used to build the node tree.</param>
public DedupNodeOrChunkHashAlgorithm(DedupNodeTree.Algorithm treeAlgorithm)
{
    // No chunker supplied by the caller, so create the default one.
    _chunker = DedupNodeHashAlgorithm.CreateChunker();
    _treeAlgorithm = treeAlgorithm;

    // Put the algorithm into its initial hashing state.
    Initialize();
}
/// <summary>
/// Gets the side-by-side textual diffs.
/// </summary>
/// <param name="differ">The differ instance.</param>
/// <param name="oldText">The old text to diff.</param>
/// <param name="newText">The new text.</param>
/// <param name="ignoreWhiteSpace">true if ignore the white space; otherwise, false.</param>
/// <param name="ignoreCase">true if case-insensitive; otherwise, false.</param>
/// <param name="lineChunker">The line chunker.</param>
/// <param name="wordChunker">The word chunker.</param>
/// <returns>The diffs result.</returns>
public static SideBySideDiffModel Diff(IDiffer differ, string oldText, string newText, bool ignoreWhiteSpace = true, bool ignoreCase = false, IChunker lineChunker = null, IChunker wordChunker = null)
{
    if (oldText == null)
    {
        throw new ArgumentNullException(nameof(oldText));
    }

    if (newText == null)
    {
        throw new ArgumentNullException(nameof(newText));
    }

    // Without a differ, delegate to the overload that supplies the default one.
    if (differ == null)
    {
        return (Diff(oldText, newText, ignoreWhiteSpace, ignoreCase));
    }

    var model = new SideBySideDiffModel();

    // First a line-level diff; the callback then runs a second diff with the
    // word chunker to produce finer-grained pieces.
    var diffResult = differ.CreateDiffs(oldText, newText, ignoreWhiteSpace, ignoreCase, lineChunker ?? LineChunker.Instance);
    BuildDiffPieces(diffResult, model.OldText.Lines, model.NewText.Lines,
        (ot, nt, op, np, iw, ic) =>
        {
            var r = differ.CreateDiffs(ot, nt, iw, ic, wordChunker ?? WordChunker.Instance);
            return (BuildDiffPieces(r, op, np, null, iw, ic));
        },
        ignoreWhiteSpace, ignoreCase);

    return (model);
}
/// <summary>
/// Initializes a new instance of the <see cref="ChunkerEvaluator"/> class.
/// </summary>
/// <param name="chunker">The chunker being evaluated.</param>
/// <param name="listeners">The evaluation listeners, forwarded to the base class.</param>
public ChunkerEvaluator(IChunker chunker, params IEvaluationMonitor<ChunkSample>[] listeners)
    : base(listeners)
{
    FMeasure = new FMeasure<Span>();
    this.chunker = chunker;
}
/// <summary>
/// Creates a test case that exercises the given chunker.
/// </summary>
/// <param name="chunker">The chunker under test.</param>
public TestCase(IChunker chunker)
{
    Chunker = chunker;
}