// TODO: it'd be nice to let the FST builder prune based
// on the term count of each node (the prune1/prune2 that it
// accepts), and build the index based on that. This
// should result in a more compact terms index, more like
// a prefix trie than the other selectors, because it
// only stores enough leading bytes to get down to N
// terms that may complete that prefix. It becomes
// "deeper" where terms are dense, and "shallower" where
// they are less dense.
//
// However, it's not easy to make that work with this
// API, because that pruning doesn't immediately know, on
// seeing each term, whether that term will be a seek point
// or not. It requires some non-causality in the API, ie
// only after seeing some number of future terms will the
// builder decide which past terms are seek points.
// Somehow the API would need to be able to return an "I don't
// know" value, eg like a Future, which is only later
// flipped (frozen) to true or false.
//
// We could solve this with a 2-pass approach, where the
// first pass would build an FSA (no outputs) solely to
// determine which prefixes are the 'leaves' in the
// pruning. The 2nd pass would then look at this prefix
// trie to mark the seek points and build the FST mapping
// to the true output.
//
// But one downside to this approach is that it'd result
// in uneven index term selection. EG with prune1=10, the
// resulting index terms could be as frequent as every 10
// terms or as rare as every <maxArcCount> * 10 (eg 2560),
// in the extremes.
public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
{
    string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION);
    m_output = state.Directory.CreateOutput(indexFileName, state.Context);
    bool success = false;
    try
    {
        //fieldInfos = state.FieldInfos; // LUCENENET: Not used
        this.policy = policy;
        WriteHeader(m_output);
        success = true;
    }
    finally
    {
        if (!success)
        {
            IOUtils.DisposeWhileHandlingException(m_output);
        }
    }
}
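// The following is a minimal illustrative sketch, not part of the shipped code: it
// shows the causal shape that the IndexTermSelector contract forces on a policy,
// which is exactly why the FST prune1/prune2 idea in the TODO above doesn't fit --
// IsIndexTerm must answer immediately for the current term, using only past terms.
// The class name and the default interval of 32 are arbitrary examples, and the
// IsIndexTerm/NewField signatures are assumed to match the IndexTermSelector
// contract declared in this file (BytesRef, TermStats, and FieldInfo come from
// this file's existing usings).
internal sealed class ExampleEveryNthTermSelector : IndexTermSelector
{
    private readonly int interval;
    private int count;

    public ExampleEveryNthTermSelector(int interval = 32)
    {
        this.interval = interval;
        count = interval; // ensure the very first term becomes a seek point
    }

    public override bool IsIndexTerm(BytesRef term, TermStats stats)
    {
        // The decision is made here and now, from past information only; a
        // pruning-based policy would need to defer it until future terms are seen.
        if (count >= interval)
        {
            count = 1;
            return true;
        }
        count++;
        return false;
    }

    public override void NewField(FieldInfo fieldInfo)
    {
        count = interval; // reset per field so each field's first term is indexed
    }
}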