/// <summary>
/// Splits the text remaining in <paramref name="reader"/> into the commit body and whatever
/// follows the <c>EndOfBody</c> marker.
/// </summary>
/// <param name="reader">Reader holding the rest of the log entry (presumably positioned at the repeated subject line -- confirm against the caller).</param>
/// <param name="subject">The commit subject; its length is used to recognise a body that holds nothing but the subject.</param>
/// <returns>
/// <c>body</c> is the <paramref name="subject"/> instance itself when nothing but the subject precedes the marker,
/// otherwise the text before the marker with trailing whitespace removed.
/// <c>additionalData</c> is the text after the marker with leading whitespace removed, or <c>null</c> when nothing follows it.
/// Both are <c>null</c> when the marker cannot be found.
/// </returns>
private static (string body, string additionalData) ParseCommitBody([NotNull] StringLineReader reader, [NotNull] string subject)
{
    int lengthOfSubjectRepeatedInBody = subject.Length + 2 /*newlines*/;

    // Fast path: the remaining text is exactly "subject + 2 newlines + marker", so there is
    // no separate body and no additional data. The reader is not advanced in this case.
    if (reader.Remaining == lengthOfSubjectRepeatedInBody + EndOfBody.Length)
    {
        return (body: subject, additionalData: null);
    }

    string tail = reader.ReadToEnd() ?? "";

    // The marker is a machine-generated token, not linguistic text, so it is matched ordinally.
    // A culture-sensitive search may treat some Unicode characters in the commit message as
    // ignorable, which can shift the reported index and corrupt the Substring offsets below.
    int indexOfEndOfBody = tail.LastIndexOf(EndOfBody, StringComparison.Ordinal);

    if (indexOfEndOfBody < 0)
    {
        // TODO log this parse error
        Debug.Fail("Missing end-of-body marker in the log -- this should not happen");
        return (body: null, additionalData: null);
    }

    string additionalData = null;
    int additionalDataOffset = indexOfEndOfBody + EndOfBody.Length;
    if (tail.Length > additionalDataOffset)
    {
        additionalData = tail.Substring(additionalDataOffset).TrimStart();
    }

    // Hand back the subject instance itself when only the subject precedes the marker,
    // so that callers can detect a single-line message by reference.
    string body = indexOfEndOfBody == lengthOfSubjectRepeatedInBody
        ? subject
        : tail.Substring(0, indexOfEndOfBody).TrimEnd();

    return (body, additionalData);
}
/// <summary>
/// Decodes one encoded git log item and produces a <see cref="GitRevision"/> from it.
/// </summary>
/// <param name="module">Used to resolve the encoding named inside the log item.</param>
/// <param name="chunk">The bytes of a single log item.</param>
/// <param name="stringPool">Passed to the line reader when reading the author/committer names and emails.</param>
/// <param name="logOutputEncoding">Decodes the encoding name, and the text itself when the item names no encoding.</param>
/// <param name="revision">The parsed revision on success; <c>default</c> otherwise.</param>
/// <returns><c>true</c> when the chunk was parsed; <c>false</c> when it is empty, truncated or malformed.</returns>
private static bool TryParseRevision(GitModule module, ArraySegment<byte> chunk, StringPool stringPool, Encoding logOutputEncoding, out GitRevision revision)
{
    // The 'chunk' of data contains a complete git log item, encoded.
    // This method decodes that chunk and produces a revision object.

    // All values which can be read directly from the byte array are arranged
    // at the beginning of the chunk. The latter part of the chunk will require
    // decoding as a string.

    if (chunk.Count == 0)
    {
        // "git log -z --name-only" returns multiple consecutive null bytes when logging
        // the history of a single file. Haven't worked out why, but it's safe to skip
        // such chunks.
        revision = default;
        return false;
    }

    #region Object ID, Tree ID, Parent IDs

    // The chunk starts with the revision ID and the tree ID back to back
    if (!ObjectId.TryParseAsciiHexBytes(chunk, 0, out var objectId) ||
        !ObjectId.TryParseAsciiHexBytes(chunk, ObjectId.Sha1CharCount, out var treeId))
    {
        revision = default;
        return false;
    }

    var array = chunk.Array;
    var offset = chunk.Offset + (ObjectId.Sha1CharCount * 2);
    var lastOffset = chunk.Offset + chunk.Count;

    if (offset >= lastOffset)
    {
        // Nothing follows the two IDs, so there is no parent list to inspect.
        // Bail out rather than reading bytes that belong to a neighbouring chunk.
        revision = default;
        return false;
    }

    // Next we have zero or more parent IDs separated by ' ' and terminated by '\n'
    var parentIds = new ObjectId[CountParents(offset)];
    var parentIndex = 0;

    // Counts the parent IDs by hopping from separator to separator, never looking past the chunk.
    int CountParents(int baseOffset)
    {
        if (array[baseOffset] == '\n')
        {
            return 0;
        }

        var count = 1;

        while (true)
        {
            baseOffset += ObjectId.Sha1CharCount;

            if (baseOffset >= lastOffset || array[baseOffset] != ' ')
            {
                break;
            }

            count++;
            baseOffset++;
        }

        return count;
    }

    while (true)
    {
        if (offset >= lastOffset - ObjectId.Sha1CharCount - 1)
        {
            revision = default;
            return false;
        }

        var b = array[offset];

        if (b == '\n')
        {
            // There are no more parent IDs
            offset++;
            break;
        }

        if (b == ' ')
        {
            // We are starting a new parent ID
            offset++;
        }

        if (parentIndex == parentIds.Length)
        {
            // Malformed parent list: more IDs than separators were counted.
            // Fail the parse instead of overrunning the pre-sized array.
            revision = default;
            return false;
        }

        if (!ObjectId.TryParseAsciiHexBytes(array, offset, out var parentId))
        {
            // TODO log this parse problem
            revision = default;
            return false;
        }

        parentIds[parentIndex++] = parentId;
        offset += ObjectId.Sha1CharCount;
    }

    #endregion

    #region Timestamps

    // Lines 2 and 3 are timestamps, as decimal ASCII seconds since the unix epoch, each terminated by `\n`
    if (!TryParseUnixDateTime(out var authorDate) ||
        !TryParseUnixDateTime(out var commitDate))
    {
        // The chunk ended before the timestamp's terminating newline
        revision = default;
        return false;
    }

    // Reads decimal digits up to the next '\n', advancing 'offset' past it.
    // Returns false if the end of the chunk is reached first. The result is converted to local time.
    bool TryParseUnixDateTime(out DateTime dateTime)
    {
        long unixTime = 0;

        while (offset < lastOffset)
        {
            var c = array[offset++];

            if (c == '\n')
            {
                dateTime = DateTimeUtils.UnixEpoch.AddTicks(unixTime * TimeSpan.TicksPerSecond).ToLocalTime();
                return true;
            }

            unixTime = (unixTime * 10) + (c - '0');
        }

        dateTime = default;
        return false;
    }

    #endregion

    #region Encoding

    // Line is the name of the encoding used by git, or an empty string, terminated by `\n`
    string encodingName;
    Encoding encoding;

    // Search only within this chunk; the backing array may hold other data beyond 'lastOffset'
    var encodingNameEndOffset = Array.IndexOf(array, (byte)'\n', offset, lastOffset - offset);

    if (encodingNameEndOffset == -1)
    {
        // TODO log this error case
        revision = default;
        return false;
    }

    if (offset == encodingNameEndOffset)
    {
        // No encoding specified
        encoding = logOutputEncoding;
        encodingName = null;
    }
    else
    {
        encodingName = logOutputEncoding.GetString(array, offset, encodingNameEndOffset - offset);
        encoding = module.GetEncodingByGitName(encodingName);
    }

    offset = encodingNameEndOffset + 1;

    #endregion

    #region Encoded string values (names, emails, subject, body, name)

    // Finally, decode the names, email, subject and body strings using the required text encoding
    var s = encoding.GetString(array, offset, lastOffset - offset);

    var reader = new StringLineReader(s);

    var author = reader.ReadLine(stringPool);
    var authorEmail = reader.ReadLine(stringPool);
    var committer = reader.ReadLine(stringPool);
    var committerEmail = reader.ReadLine(stringPool);

    var subject = reader.ReadLine(advance: false);

    if (author == null || authorEmail == null || committer == null || committerEmail == null || subject == null)
    {
        // TODO log this parse error
        Debug.Fail("Unable to read an entry from the log -- this should not happen");
        revision = default;
        return false;
    }

    // NOTE the convention is that the Subject string is duplicated at the start of the Body string
    // Therefore we read the subject twice.
    // If there are not enough characters remaining for a body, then just assign the subject string directly.
    var bodyIsSubjectOnly = reader.Remaining - subject.Length == 2;
    var body = bodyIsSubjectOnly ? subject : reader.ReadToEnd();

    if (body == null)
    {
        // TODO log this parse error
        Debug.Fail("Unable to read body from the log -- this should not happen");
        revision = default;
        return false;
    }

    // The marker is a machine-generated token, so it is matched ordinally rather than linguistically
    var indexOfEndOfBody = body.LastIndexOf(EndOfBody, StringComparison.Ordinal);

    string additionalData = null;

    if (indexOfEndOfBody >= 0)
    {
        var additionalDataOffset = indexOfEndOfBody + EndOfBody.Length;

        if (body.Length > additionalDataOffset)
        {
            additionalData = body.Substring(additionalDataOffset).TrimStart();
        }

        // NOTE(review): this yields a fresh string, so HasMultiLineMessage below is true whenever
        // the marker is present, even if only the subject precedes it -- confirm that is intended.
        body = body.Substring(0, indexOfEndOfBody);
    }
    else if (!bodyIsSubjectOnly)
    {
        // Without the marker the body cannot be split from the trailing data.
        // Previously this fell through to Substring(0, -1), which throws.
        // TODO log this parse error
        Debug.Fail("Missing end-of-body marker in the log -- this should not happen");
        revision = default;
        return false;
    }

    // else: the body is the subject itself and carries no marker, so there is nothing to strip

    #endregion

    revision = new GitRevision(objectId)
    {
        ParentIds = parentIds,
        TreeGuid = treeId,
        Author = author,
        AuthorEmail = authorEmail,
        AuthorDate = authorDate,
        Committer = committer,
        CommitterEmail = committerEmail,
        CommitDate = commitDate,
        MessageEncoding = encodingName,
        Subject = subject,
        Body = body,
        Name = additionalData,
        HasMultiLineMessage = !ReferenceEquals(subject, body),
        HasNotes = false
    };

    return true;
}
/// <summary>
/// Decodes one encoded git log item and produces a <see cref="GitRevision"/> from it.
/// </summary>
/// <param name="module">Used to resolve the encoding named inside the log item.</param>
/// <param name="chunk">The bytes of a single log item.</param>
/// <param name="stringPool">Passed to the line reader when reading the author/committer names and emails.</param>
/// <param name="logOutputEncoding">Decodes the encoding name, and the text itself when the item names no encoding.</param>
/// <param name="revision">The parsed revision on success; <c>default</c> otherwise.</param>
/// <returns><c>true</c> when the chunk was parsed; <c>false</c> when it is truncated or malformed.</returns>
private static bool TryParseRevision(GitModule module, ArraySegment<byte> chunk, StringPool stringPool, Encoding logOutputEncoding, out GitRevision revision)
{
    // The 'chunk' of data contains a complete git log item, encoded.
    // This method decodes that chunk and produces a revision object.

    // All values which can be read directly from the byte array are arranged
    // at the beginning of the chunk. The latter part of the chunk will require
    // decoding as a string.

    #region Object ID, Tree ID, Parent IDs

    // The chunk starts with the revision ID and the tree ID back to back
    if (!ObjectId.TryParseAsciiHexBytes(chunk, 0, out var objectId) ||
        !ObjectId.TryParseAsciiHexBytes(chunk, ObjectId.Sha1CharCount, out var treeId))
    {
        revision = default;
        return false;
    }

    var objectIdStr = objectId.ToString();

    var array = chunk.Array;
    var offset = chunk.Offset + (ObjectId.Sha1CharCount * 2);
    var lastOffset = chunk.Offset + chunk.Count;

    // Next we have zero or more parent IDs separated by ' ' and terminated by '\n'
    var parentIds = new List<ObjectId>(capacity: 1);

    while (true)
    {
        // NOTE(review): 21 is kept from the original; it looks like a minimum number of bytes
        // that must follow the parent list -- confirm the intended value.
        if (offset >= lastOffset - 21)
        {
            revision = default;
            return false;
        }

        var b = array[offset];

        if (b == '\n')
        {
            // There are no more parent IDs
            offset++;
            break;
        }

        if (b == ' ')
        {
            // We are starting a new parent ID
            offset++;
        }

        if (offset + ObjectId.Sha1CharCount >= lastOffset)
        {
            // A full ID plus its terminator does not fit in what is left of the chunk.
            // The check above does not guarantee that, so stop before reading outside the chunk.
            revision = default;
            return false;
        }

        if (!ObjectId.TryParseAsciiHexBytes(array, offset, out var parentId))
        {
            // TODO log this parse problem
            revision = default;
            return false;
        }

        parentIds.Add(parentId);
        offset += ObjectId.Sha1CharCount;
    }

    #endregion

    #region Timestamps

    // Lines 2 and 3 are timestamps, as decimal ASCII seconds since the unix epoch, each terminated by `\n`
    if (!TryParseUnixDateTime(out var authorDate) ||
        !TryParseUnixDateTime(out var commitDate))
    {
        // The chunk ended before the timestamp's terminating newline
        revision = default;
        return false;
    }

    // Reads decimal digits up to the next '\n', advancing 'offset' past it.
    // Returns false if the end of the chunk is reached first. The result is converted to local time.
    bool TryParseUnixDateTime(out DateTime dateTime)
    {
        long unixTime = 0;

        while (offset < lastOffset)
        {
            var c = array[offset++];

            if (c == '\n')
            {
                dateTime = DateTimeUtils.UnixEpoch.AddTicks(unixTime * TimeSpan.TicksPerSecond).ToLocalTime();
                return true;
            }

            unixTime = (unixTime * 10) + (c - '0');
        }

        dateTime = default;
        return false;
    }

    #endregion

    #region Encoding

    // Line is the name of the encoding used by git, or an empty string, terminated by `\n`
    string encodingName;
    Encoding encoding;

    // Search only within this chunk; the backing array may hold other data beyond 'lastOffset'
    var encodingNameEndOffset = Array.IndexOf(array, (byte)'\n', offset, lastOffset - offset);

    if (encodingNameEndOffset == -1)
    {
        // TODO log this error case
        revision = default;
        return false;
    }

    if (offset == encodingNameEndOffset)
    {
        // No encoding specified
        encoding = logOutputEncoding;
        encodingName = null;
    }
    else
    {
        encodingName = logOutputEncoding.GetString(array, offset, encodingNameEndOffset - offset);
        encoding = module.GetEncodingByGitName(encodingName);
    }

    offset = encodingNameEndOffset + 1;

    #endregion

    #region Encoded string values (names, emails, subject, body)

    // Finally, decode the names, email, subject and body strings using the required text encoding
    var s = encoding.GetString(array, offset, lastOffset - offset);

    var reader = new StringLineReader(s);

    var author = reader.ReadLine(stringPool);
    var authorEmail = reader.ReadLine(stringPool);
    var committer = reader.ReadLine(stringPool);
    var committerEmail = reader.ReadLine(stringPool);

    // NOTE the convention is that the Body property contains a copy of Subject
    // Therefore we read the subject twice: peek it here, then let ReadToEnd include it again
    var subject = reader.ReadLine(advance: false);
    var body = reader.ReadToEnd();

    if (author == null || authorEmail == null || committer == null || committerEmail == null || subject == null || body == null)
    {
        // TODO log this parse error
        Debug.Fail("Unable to read an entry from the log -- this should not happen");
        revision = default;
        return false;
    }

    #endregion

    revision = new GitRevision(null)
    {
        // TODO are we really sure we can't make Revision.Guid an ObjectId?
        Guid = objectIdStr,

        // TODO take IReadOnlyList<ObjectId> instead
        ParentGuids = parentIds.ToArray(p => p.ToString()),

        TreeGuid = treeId,
        Author = author,
        AuthorEmail = authorEmail,
        AuthorDate = authorDate,
        Committer = committer,
        CommitterEmail = committerEmail,
        CommitDate = commitDate,
        MessageEncoding = encodingName,
        Subject = subject,
        Body = body,
        HasMultiLineMessage = !string.IsNullOrWhiteSpace(body)
    };

    return true;
}