/// <summary>
/// Does the grunt work of searching the file for 'pat'. Scans from 'start' in the
/// direction given by 'backward', passing the file-space byte range of each matching
/// line to 'on_found'. The search stops when 'on_found' returns false or the scan
/// reaches the end (or start) of the file.
/// </summary>
private static void DoFind(Pattern pat, long start, bool backward, BLIData d, Func<RangeI, bool> on_found)
{
	using (d.file)
	{
		var row = new Line();

		// Callback invoked for each line found during the scan.
		// Returning true keeps the search going; false stops it.
		AddLineFunc test_line = (line_rng, baddr, fend, bf, enc) =>
		{
			// Skip blank lines if requested
			if (line_rng.Empty && d.ignore_blanks)
				return true;

			// Parse the line from the buffered file data
			row.Read(baddr + line_rng.Beg, bf, (int)line_rng.Beg, (int)line_rng.Size, d.encoding, d.col_delim, null, d.transforms);

			// Lines that are filtered out cannot match
			if (!PassesFilters(row.RowText, d.filters))
				return true;

			// Keep searching until the pattern matches
			if (!pat.IsMatch(row.RowText))
				return true;

			// Found a match; 'on_found' decides whether to continue
			return on_found(new RangeI(baddr + line_rng.Beg, baddr + line_rng.End));
		};

		// Scan toward the appropriate end of the file
		var scan_buf = new byte[d.max_line_length];
		var count = backward ? start : d.fileend - start;
		FindLines(d.file, start, d.fileend, backward, count, test_line, d.encoding, d.row_delim, scan_buf, d.progress);
	}
}
/// <summary>Export byte ranges of the input file to 'outp'.</summary>
/// <param name="d">A copy of the data needed to do the export</param>
/// <param name="ranges">Byte ranges within the input file to export</param>
/// <param name="row_delimiter">The row delimiter to use in the output file</param>
/// <param name="col_delimiter">The column delimiter to use in the output file</param>
/// <param name="outp">The output stream to write the exported result to</param>
private static void DoExport(BLIData d, IEnumerable<RangeI> ranges, string row_delimiter, string col_delimiter, StreamWriter outp)
{
	var row = new Line();

	// Callback invoked for each line detected within an exported range
	AddLineFunc emit_line = (line_rng, baddr, fend, bf, enc) =>
	{
		// Skip blank lines if requested
		if (line_rng.Empty && d.ignore_blanks)
			return true;

		// Parse the line from the buffered file data
		row.Read(baddr + line_rng.Beg, bf, (int)line_rng.Beg, (int)line_rng.Size, d.encoding, d.col_delim, null, d.transforms);

		// Lines that are filtered out are not exported
		if (!PassesFilters(row.RowText, d.filters))
			return true;

		// Write the line to the output using the requested delimiters
		outp.Write(string.Join(col_delimiter, row.Column));
		outp.Write(row_delimiter);
		return true;
	};

	var scan_buf = new byte[d.max_line_length];
	foreach (var rng in ranges)
	{
		// Clamp the range to the file, then grow it back to the start of a line
		var beg = Math_.Clamp(rng.Beg, 0, d.file.Stream.Length);
		var end = Math_.Clamp(rng.End, 0, d.file.Stream.Length);
		beg = FindLineStart(d.file, beg, end, d.row_delim, d.encoding, scan_buf);

		// Read lines over the range and write them to the export file
		FindLines(d.file, beg, end, false, end - beg, emit_line, d.encoding, d.row_delim, scan_buf, d.progress);
	}
}
/// <summary>
/// Scan the file from 'filepos' adding whole lines to 'line_index' until 'length' bytes
/// have been read or 'add_line' returns false. Returns true if not interrupted.
/// </summary>
/// <param name="src">The stream to scan</param>
/// <param name="filepos">The position in the file to start scanning from</param>
/// <param name="fileend">The current known length of the file</param>
/// <param name="backward">The direction to scan</param>
/// <param name="length">The number of bytes to scan over</param>
/// <param name="add_line">Callback function called with each detected line</param>
/// <param name="encoding">The text file encoding</param>
/// <param name="line_end">The bytes that identify an end of line</param>
/// <param name="buf">A buffer to use when buffering file data</param>
/// <param name="progress">Optional callback to report progress and allow the scan to abort</param>
private static bool FindLines(Stream src, long filepos, long fileend, bool backward, long length, AddLineFunc add_line, Encoding encoding, byte[] line_end, byte[] buf, ProgressFunc progress)
{
	long scanned = 0, read_addr = filepos;
	for (;;)
	{
		// Progress update.
		// Note: 'progress' is null-checked here for consistency with the IFileSource
		// overload of FindLines, which tolerates a null progress callback.
		if (progress != null && !progress(scanned, length))
			return false;

		// Seek to the start position
		src.Seek(read_addr, SeekOrigin.Begin);

		// Buffer the contents of the file in 'buf'.
		var remaining = length - scanned;
		var read = Buffer(src, remaining, fileend, encoding, backward, buf, out var eof);
		if (read == 0)
			break;

		// Set iterator limits.
		// 'i' is where to start scanning from
		// 'iend' is the end of the range to scan
		// 'lasti' is the start of the last line found
		// 'base_addr' is the file offset from which buf was read
		var i = backward ? read - 1 : 0;
		var iend = backward ? -1 : read;
		var lasti = backward ? read : 0;
		var base_addr = backward ? src.Position : src.Position - read;

		// If we're searching backwards and 'i' is at the end of a line,
		// we don't want to count that as the first found line so adjust 'i'.
		// If not however, then 'i' is partway through a line or at the end
		// of a file without a 'line_end' at the end and we want to include
		// this (possibly partial) line.
		if (backward && IsLineEnd(buf, read - line_end.Length, line_end))
			i -= line_end.Length;

		// Scan the buffer for lines
		for (i = Tools.FindNextDelim(buf, i, read, line_end, backward); i != iend; i = Tools.FindNextDelim(buf, i, read, line_end, backward))
		{
			// 'i' points to the start of a line,
			// 'lasti' points to the start of the last line we found
			// Get the range in buf containing the line
			var line = backward
				? new RangeI(i, lasti - line_end.Length)
				: new RangeI(lasti, i - line_end.Length);

			// Pass the detected line to the callback
			if (!add_line(line, base_addr, fileend, buf, encoding))
				return false;

			lasti = i;
			if (backward)
				i -= line_end.Length + 1;
		}

		// From 'lasti' to the end (or start in the backwards case) of the buffer represents
		// a (possibly partial) line. If we read a full buffer load last time, then we'll go
		// round again trying to read another buffer load, starting from 'lasti'.
		if (read == buf.Length)
		{
			// Make sure we're always making progress
			var scan_increment = backward ? (read - lasti) : lasti;
			if (scan_increment == 0) // No lines detected in this block
				throw new NoLinesException(read);

			scanned += scan_increment;
			read_addr = filepos + (backward ? -scanned : +scanned);
		}
		// Otherwise, we've read to the end (or start) of the file, or to the limit 'length'.
		// What's left in the buffer may be a partial line.
		else
		{
			// 'i' points to 'iend',
			// 'lasti' points to the start of the last line we found
			// Get the range in buf containing the line
			var line = backward
				? new RangeI(i + 1, lasti - line_end.Length)
				: new RangeI(lasti, i - (IsLineEnd(buf, i - line_end.Length, line_end) ? line_end.Length : 0));

			// Pass the detected line to the callback
			if (!add_line(line, base_addr, fileend, buf, encoding))
				return false;

			break;
		}
	}
	return true;
}
/// <summary>Scan the file from 'filepos' adding whole lines to 'line_index' until 'length' bytes have been read or 'add_line' returns false</summary>
/// <param name="file">The file to scan</param>
/// <param name="filepos">The position in the file to start scanning from</param>
/// <param name="fileend">The current known length of the file</param>
/// <param name="backward">The direction to scan</param>
/// <param name="length">The number of bytes to scan over</param>
/// <param name="add_line">Callback function called with each detected line</param>
/// <param name="encoding">The text file encoding</param>
/// <param name="row_delim">The bytes that identify an end of line</param>
/// <param name="buf">A buffer to use when buffering file data</param>
/// <param name="progress">Callback function to report progress and allow the find to abort (may be null)</param>
private static void FindLines(IFileSource file, long filepos, long fileend, bool backward, long length, AddLineFunc add_line, Encoding encoding, byte[] row_delim, byte[] buf, ProgressFunc progress)
{
	long scanned = 0, read_addr = filepos;
	for (;;)
	{
		// Progress update (a null 'progress' means scan without reporting)
		if (progress != null && !progress(scanned, length))
		{
			return;
		}

		// Seek to the start position
		file.Stream.Seek(read_addr, SeekOrigin.Begin);

		// Buffer the contents of the file in 'buf'.
		long remaining = length - scanned;
		bool eof;
		int read = Buffer(file, remaining, fileend, encoding, backward, buf, out eof);
		if (read == 0)
		{
			break;
		}

		// Set iterator limits.
		// 'i' is where to start scanning from
		// 'iend' is the end of the range to scan
		// 'lasti' is the start of the last line found
		// 'base_addr' is the file offset from which buf was read
		int i = backward ? read - 1 : 0;
		int iend = backward ? -1 : read;
		int lasti = backward ? read : 0;
		long base_addr = backward ? file.Stream.Position : file.Stream.Position - read;

		// If we're searching backwards and 'i' is at the end of a line,
		// we don't want to count that as the first found line so adjust 'i'.
		// If not however, then 'i' is partway through a line or at the end
		// of a file without a row delimiter at the end and we want to include
		// this (possibly partial) line.
		if (backward && IsRowDelim(buf, read - row_delim.Length, row_delim))
		{
			i -= row_delim.Length;
		}

		// Scan the buffer for lines
		for (i = Misc.FindNextDelim(buf, i, read, row_delim, backward); i != iend; i = Misc.FindNextDelim(buf, i, read, row_delim, backward))
		{
			// 'i' points to the start of a line,
			// 'lasti' points to the start of the last line we found
			// Get the range in buf containing the line
			RangeI line = backward
				? new RangeI(i, lasti - row_delim.Length)
				: new RangeI(lasti, i - row_delim.Length);

			// Pass the detected line to the callback
			if (!add_line(line, base_addr, fileend, buf, encoding))
			{
				return;
			}

			lasti = i;
			if (backward)
			{
				// Step past the delimiter so the next search starts inside the previous line
				i -= row_delim.Length + 1;
			}
		}

		// From 'lasti' to the end (or start in the backwards case) of the buffer represents
		// a (possibly partial) line. If we read a full buffer load last time, then we'll go
		// round again trying to read another buffer load, starting from 'lasti'.
		if (read == buf.Length)
		{
			// Make sure we're always making progress
			long scan_increment = backward ? (read - lasti) : lasti;
			if (scan_increment == 0) // No lines detected in this block
			{
				throw new NoLinesException(read);
			}

			scanned += scan_increment;
			read_addr = filepos + (backward ? -scanned : +scanned);
		}
		// Otherwise, we've read to the end (or start) of the file, or to the limit 'length'.
		// What's left in the buffer may be a partial line.
		else
		{
			// 'i' points to 'iend',
			// 'lasti' points to the start of the last line we found
			// Get the range in buf containing the line
			RangeI line = backward
				? new RangeI(i + 1, lasti - row_delim.Length)
				: new RangeI(lasti, i - (IsRowDelim(buf, i - row_delim.Length, row_delim) ? row_delim.Length : 0));

			// ReSharper disable RedundantJumpStatement
			// Pass the detected line to the callback
			if (!add_line(line, base_addr, fileend, buf, encoding))
			{
				return;
			}
			// ReSharper restore RedundantJumpStatement

			break;
		}
	}
}
/// <summary>The grunt work of building the new line index.</summary>
/// <param name="d">A snapshot of the data needed to build the index (file, delimiters, cache state, etc.)</param>
/// <param name="on_complete">Called exactly once with (data, scanned range, new line index, error). 'error' is non-null on failure.</param>
private static void BuildLineIndexAsync(BLIData d, Action<BLIData, RangeI, List<RangeI>, Exception> on_complete)
{
	// This method runs in a background thread
	// All we're doing here is loading data around 'd.filepos' so that there are an equal number
	// of lines on either side. This can be optimised however because the existing range of
	// cached data probably overlaps the range we want loaded.
	try
	{
		Log.Write(ELogLevel.Info, "BLIAsync", $"build started. (id {d.build_issue}, reload {d.reload})");
		if (BuildCancelled(d.build_issue))
		{
			return;
		}

		using (d.file)
		{
			// A temporary buffer for reading sections of the file
			var buf = new byte[d.max_line_length];

			// Seek to the first line that starts immediately before 'filepos'
			d.filepos = FindLineStart(d.file, d.filepos, d.fileend, d.row_delim, d.encoding, buf);
			if (BuildCancelled(d.build_issue))
			{
				return;
			}

			// Determine the range to scan and the number of lines in each direction
			var scan_backward = (d.fileend - d.filepos) > (d.filepos - 0); // scan in the most bound direction first
			var scan_range = CalcBufferRange(d.filepos, d.fileend, d.file_buffer_size);
			var line_range = CalcLineRange(d.line_cache_count);
			var bwd_lines = line_range.Begi;
			var fwd_lines = line_range.Endi;

			// Incremental loading - only load what isn't already cached.
			// If the 'filepos' is left of the cache centre, try to extend in the left direction first.
			// If the scan range in that direction is empty, try extending at the other end. The
			// aim is to try to get d.line_index_count as close to d.line_cache_count as possible
			// without loading data that is already cached.
			#region Incremental loading
			if (!d.reload && !d.cached_whole_line_range.Empty)
			{
				// Determine the direction the cached range is moving based on where 'filepos' is relative
				// to the current cache centre and which range contains a valid area to be scanned.
				// With incremental scans we can only update one side of the cache because the returned line index has to
				// be a contiguous block of lines. This means one of 'bwd_lines' or 'fwd_lines' must be zero.
				var Lrange = new RangeI(scan_range.Beg, d.cached_whole_line_range.Beg);
				var Rrange = new RangeI(d.cached_whole_line_range.End, scan_range.End);
				var dir =
					(!Lrange.Empty && !Rrange.Empty) ? Math.Sign(2 * d.filepos_line_index - d.line_cache_count) :
					(!Lrange.Empty) ? -1 :
					(!Rrange.Empty) ? +1 :
					0;

				// Determine the number of lines to scan, based on direction
				if (dir < 0)
				{
					// Extend the cache to the left only
					scan_backward = true;
					scan_range = Lrange;
					bwd_lines -= Math_.Clamp(d.filepos_line_index - 0, 0, bwd_lines);
					fwd_lines = 0;
				}
				else if (dir > 0)
				{
					// Extend the cache to the right only
					scan_backward = false;
					scan_range = Rrange;
					bwd_lines = 0;
					fwd_lines -= Math_.Clamp(d.line_index_count - d.filepos_line_index - 1, 0, fwd_lines);
				}
				else if (dir == 0)
				{
					// Nothing to load; the cache already covers the wanted range
					bwd_lines = 0;
					fwd_lines = 0;
					scan_range = RangeI.Zero;
				}
			}
			#endregion

			Debug.Assert(bwd_lines + fwd_lines <= d.line_cache_count);

			// Build the collection of line byte ranges to add to the cache
			var line_index = new List<RangeI>();
			if (bwd_lines != 0 || fwd_lines != 0)
			{
				// Line index buffers for collecting the results
				var fwd_line_buf = new List<RangeI>();
				var bwd_line_buf = new List<RangeI>();

				// Data used in the 'add_line' callback. Updated for forward and backward passes
				var lbd = new LineBufferData
				{
					line_buf = null, // pointer to either 'fwd_line_buf' or 'bwd_line_buf'
					line_limit = 0, // Caps the number of lines read for each of the forward and backward searches
				};

				// Callback for adding line byte ranges to a line buffer
				AddLineFunc add_line = (line, baddr, fend, bf, enc) =>
				{
					if (line.Empty && d.ignore_blanks)
					{
						return (true);
					}

					// Test 'text' against each filter to see if it's included
					// Note: not caching this string because we want to read immediate data
					// from the file to pick up file changes.
					// NOTE(review): this decodes from the captured 'buf' rather than the
					// callback's 'bf' parameter. They are the same array here because 'buf'
					// is what gets passed to FindLines below — confirm if that ever changes.
					string text = d.encoding.GetString(buf, (int)line.Beg, (int)line.Size);
					if (!PassesFilters(text, d.filters))
					{
						return (true);
					}

					// Convert the byte range to a file range
					line = line.Shift(baddr);
					Debug.Assert(new RangeI(0, d.fileend).Contains(line));
					lbd.line_buf.Add(line);
					Debug.Assert(lbd.line_buf.Count <= lbd.line_limit);
					// Keep scanning while the combined line count is below the current limit
					return ((fwd_line_buf.Count + bwd_line_buf.Count) < lbd.line_limit);
				};

				// Callback for updating progress
				ProgressFunc progress = (scanned, length) =>
				{
					// Progress is reported as lines-collected / lines-wanted, not bytes
					int numer = fwd_line_buf.Count + bwd_line_buf.Count, denom = lbd.line_limit;
					return (d.progress(numer, denom) && !BuildCancelled(d.build_issue));
				};

				// Scan twice, starting in the direction of the smallest range so that any
				// unused cache space is used by the search in the other direction
				var scan_from = Math_.Clamp(d.filepos, scan_range.Beg, scan_range.End);
				for (int a = 0; a != 2; ++a, scan_backward = !scan_backward)
				{
					if (BuildCancelled(d.build_issue))
					{
						return;
					}

					// Point the callback at the buffer for this direction and raise the line cap
					lbd.line_buf = scan_backward ? bwd_line_buf : fwd_line_buf;
					lbd.line_limit += scan_backward ? bwd_lines : fwd_lines;
					if ((bwd_line_buf.Count + fwd_line_buf.Count) < lbd.line_limit)
					{
						var length = scan_backward ? scan_from - scan_range.Beg : scan_range.End - scan_from;
						FindLines(d.file, scan_from, d.fileend, scan_backward, length, add_line, d.encoding, d.row_delim, buf, progress);
					}
				}

				// Scanning backward adds lines to the line index in reverse order.
				bwd_line_buf.Reverse();

				// 'line_index' should be a contiguous block of byte offset ranges for
				// the lines around 'd.filepos'. If 'd.reload' is false, then the line
				// index will only contain byte offset ranges that are not currently cached.
				line_index.Capacity = bwd_line_buf.Count + fwd_line_buf.Count;
				line_index.AddRange(bwd_line_buf);
				line_index.AddRange(fwd_line_buf);
			}

			// Job done
			on_complete(d, scan_range, line_index, null);
		}
	}
	catch (Exception ex)
	{
		// Report failure; a null line index signals the error case to the caller
		on_complete(d, RangeI.Zero, null, ex);
	}
}