/// <summary>
/// Walks a span of uncategorized bytes and assigns a format descriptor to each
/// position, promoting runs of identical bytes and ASCII strings to structured
/// data where they are found.
/// </summary>
/// <param name="start">Offset of first byte in range.</param>
/// <param name="end">Offset of last byte in range (inclusive).</param>
private void AnalyzeRange(int start, int end) {
    // TODO(someday): consider copying the buffer into a string and using Regex.  This
    // can be done fairly quickly with "unsafe" code, e.g.:
    // https://stackoverflow.com/questions/3028768/net-regular-expressions-on-bytes-instead-of-chars
    // Could be useful for ASCII stuff and the repeated-byte detector, e.g.:
    // https://stackoverflow.com/questions/1660694/regular-expression-to-match-any-character-being-repeated-more-than-10-times

    mDebugLog.LogI("Analyzing +" + start.ToString("x6") + " - +" + end.ToString("x6"));

    int minStrLen = mAnalysisParams.MinCharsForString;
    bool analysisEnabled = mAnalysisParams.AnalyzeUncategorizedData;

    // Shared descriptor for every byte that ends up as plain single-byte data.  The
    // bump by -1 is issued right after the Create() call, and a bump with no argument
    // is issued each time the shared descriptor is assigned below.
    FormatDescriptor singleByte = FormatDescriptor.Create(1,
        FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
    FormatDescriptor.DebugPrefabBump(-1);

    if (!analysisEnabled) {
        // Analysis is switched off: every byte in the range becomes single-byte data.
        for (int offset = start; offset <= end; offset++) {
            mAnattribs[offset].DataDescriptor = singleByte;
            FormatDescriptor.DebugPrefabBump();
        }
        return;
    }

    int cursor = start;
    while (cursor <= end) {
        // First candidate: a block of repeated values starting at the cursor.
        int runLen = RecognizeRun(mFileData, cursor, end);
        bool printable = TextUtil.IsPrintableAscii((char)(mFileData[cursor] & 0x7f));

        if (runLen >= MIN_RUN_LENGTH) {
            // A run of printable characters that is short enough (e.g. fits on one
            // line, 64 chars including delimiters) and meets the minimum string
            // length is better shown as a string.  We deliberately do NOT create the
            // string descriptor here: doing so would save a little time, but a
            // string like "*****hello" would be split into two separate strings.
            // Instead we fall through and let the ASCII recognizer handle it.
            bool betterAsString = printable &&
                runLen <= MIN_RUN_LENGTH_ASCII &&
                runLen >= minStrLen;
            if (!betterAsString) {
                LogV(cursor, "Run of 0x" + mFileData[cursor].ToString("x2") + ": " +
                    runLen + " bytes");
                mAnattribs[cursor].DataDescriptor = FormatDescriptor.Create(
                    runLen, FormatDescriptor.Type.Fill,
                    FormatDescriptor.SubType.None);
                cursor += runLen;
                continue;
            }
        }

        // Second candidate: an ASCII string starting at the cursor.
        int textLen = RecognizeAscii(mFileData, cursor, end);
        if (textLen >= minStrLen) {
            LogV(cursor, "ASCII string, len=" + textLen + " bytes");
            mAnattribs[cursor].DataDescriptor = FormatDescriptor.Create(textLen,
                FormatDescriptor.Type.String, FormatDescriptor.SubType.None);
            cursor += textLen;
            continue;
        }

        // Neither pattern matched.  Emit one plain byte, which is the easiest form
        // for users to edit.
        mAnattribs[cursor].DataDescriptor = singleByte;
        FormatDescriptor.DebugPrefabBump();

        // Advance by exactly one byte.  Skipping ahead by the run length would be
        // valid if only identical-byte runs mattered, but it would lose short ASCII
        // strings that begin with repeated bytes, e.g. "---%".
        cursor++;
    }
}
/// <summary>
/// Analyzes uncategorized regions of the file to see if they fit common patterns.
///
/// This is re-run after most changes to the project, so we don't want to do anything
/// crazily expensive.
/// </summary>
/// <remarks>
/// Works in two passes over the per-offset attribute array.  The first pass flags
/// every byte that is neither an instruction nor inline data as data, and gives
/// inline data a default one-byte descriptor if it has none.  The second pass
/// gathers contiguous uncategorized offsets into ranges and hands each range to
/// AnalyzeRange().  A range is ended by an instruction, inline data, the start of
/// formatted data, a byte that carries a symbol / comment / note, or a
/// discontinuity in the address sequence.
/// </remarks>
public void AnalyzeUncategorized() {
    // TODO(someday): we can make this faster.  The data doesn't change, so we
    // only need to do a full scan once, when the file is first loaded.  We can
    // create a TypedRangeSet for runs of identical bytes, using the byte value
    // as the type.  A second TypedRangeSet would identify runs of ASCII chars,
    // with different types for high/low ASCII (and PETSCII?).  AnalyzeRange() would
    // then just need to find the intersection with the sets, which should be
    // significantly faster.  We would need to re-do the scan if the parameters
    // for things like min match length change.

    FormatDescriptor oneByteDefault = FormatDescriptor.Create(1,
        FormatDescriptor.Type.Default, FormatDescriptor.SubType.None);
    FormatDescriptor.DebugPrefabBump(-1);

    // If it hasn't been identified as code or data, set the "data" flag to
    // give it a positive identification as data.  (This should be the only
    // place outside of CodeAnalysis that sets this flag.)  This isn't strictly
    // necessary, but it helps us assert things when pieces start moving around.
    for (int offset = 0; offset < mAnattribs.Length; offset++) {
        Anattrib attr = mAnattribs[offset];
        if (attr.IsInlineData) {
            // While we're here, add a default format descriptor for inline data
            // that doesn't have one.  We don't try to analyze it otherwise.
            if (attr.DataDescriptor == null) {
                mAnattribs[offset].DataDescriptor = oneByteDefault;
                FormatDescriptor.DebugPrefabBump();
            }
        } else if (!attr.IsInstruction) {
            mAnattribs[offset].IsData = true;
        }
    }

    mDebugLog.LogI("Analyzing uncategorized data...");

    // Offset of the first byte in the range currently being gathered, or -1 if no
    // range is open.
    int startOffset = -1;
    for (int offset = 0; offset < mAnattribs.Length;) {
        // We want to find a contiguous series of offsets which are not known
        // to hold code or data.  We stop if we encounter a user-defined label
        // or format descriptor.
        Anattrib attr = mAnattribs[offset];

        if (attr.IsInstruction || attr.IsInlineData || attr.IsDataStart) {
            // Instruction, inline data, or formatted data known to be here.  Analyze
            // previous chunk, then advance past this.
            if (startOffset >= 0) {
                AnalyzeRange(startOffset, offset - 1);
                startOffset = -1;
            }
            if (attr.IsInstruction) {
                // Because of embedded instructions, we can't simply leap forward.
                offset++;
            } else {
                Debug.Assert(attr.Length > 0);
                offset += attr.Length;
            }
            continue;
        }

        if (attr.Symbol != null || mProject.HasCommentOrNote(offset)) {
            // In an uncategorized area, but we want to break at this byte
            // so the user or auto label doesn't get buried in the middle of
            // a large chunk.
            //
            // This is similar to, but independent of, GroupedOffsetSetFromSelected()
            // in ProjectView.  This is for auto-detection, the other is for user
            // selection.  It's best if the two behave similarly though.
            if (startOffset >= 0) {
                AnalyzeRange(startOffset, offset - 1);
            }
            startOffset = offset;
        } else if (startOffset < 0) {
            // This offset is uncategorized and no range is open; start gathering.
            startOffset = offset;
        }
        offset++;

        // Check to see if the address has changed from the previous entry.  This
        // is done for labeled/annotated bytes as well as plain ones: a byte that
        // starts a new range may itself be the last byte before the address
        // change, and the range must not be allowed to continue across it.
        if (offset < mAnattribs.Length &&
                mAnattribs[offset - 1].Address + 1 != mAnattribs[offset].Address) {
            // Must be an ORG here.  Scan previous region.
            AnalyzeRange(startOffset, offset - 1);
            startOffset = -1;
        }
    }

    // Flush whatever range is still open at the end of the file.
    if (startOffset >= 0) {
        AnalyzeRange(startOffset, mAnattribs.Length - 1);
    }
}