/// <summary> /// 文本相似度比较 /// </summary> /// <param name="baseVersion"></param> /// <param name="newVersion"></param> /// <returns>返回值为 (baseVersion.length + newVersion.length - different.length)/(baseVersion.length + newVersion.length)</returns> public static decimal Similar(string baseVersion, string newVersion) { if (baseVersion == newVersion) { return(1); } if (string.IsNullOrEmpty(baseVersion) || string.IsNullOrEmpty(newVersion)) { return(0); } int diffLength = 0; DiffResult result = DiffResult.Diff(baseVersion, newVersion, 1); List <ChunkResult> chunks = result.ChunkList; foreach (ChunkResult chunk in chunks) { if (!chunk.sameChunk) { if (chunk.xString.Length < 1) { diffLength += DiffResult.CopyShaow(chunk.yString).Length; diffLength += chunk.yString.Length; } else if (chunk.yString.Length < 1) { diffLength += chunk.xString.Length; diffLength += DiffResult.CopyShaow(chunk.xString).Length; } else { diffLength += chunk.xString.Length; diffLength += chunk.yString.Length; } } } return(Decimal.Divide(baseVersion.Length + newVersion.Length - diffLength, baseVersion.Length + newVersion.Length)); }
/// <summary> /// 对比两个版本的字符串 /// </summary> /// <param name="baseVersion">旧版</param> /// <param name="newVersion">新版</param> /// <param name="minSegments">最小相同区间偏移量(取值范围 (1-minStringLength))</param> /// <returns>对比结果对象</returns> public static unsafe DiffResult Diff(string baseVersion, string newVersion, int minSegments) { //比较文件 fixed(char *xptr = baseVersion) { fixed(char *yptr = newVersion) { //int minSegments = 100; //x,y 对应的字节流长度 int xlen = baseVersion.Length; int ylen = newVersion.Length; //文档最后一次相同的地方 int xdiff = -1, ydiff = -1; //遍历开始,和结束 int xi = 0, yi = 0; //回溯索引 int byi, bxi; //交叉缓冲区索引 int xbuf, ybuf; //x,y 字节流对应的字节码 char xbyte, ybyte, inBackByte = '\0'; SameChunk root = null, prev = null, curr; /* * * 不匹配缓冲区,用来存储不匹配字节索引 * 大小为:256*4 占用1K内存 * xbuffer[x不匹配字节]=x_index; * ybuffer[y不匹配字节]=y_index; */ int[] xbuffer = new int[UInt16.MaxValue + 1]; int[] ybuffer = new int[UInt16.MaxValue + 1]; //一些临时变量 int xtmp = 0, ytmp = 0, elen = 0; //下列变量标识当前状态是否为回溯 bool xInback = false; int xInbackTmp = 0; bool yInback = false; int yInBackTmp = 0; //在回溯失败重检有没有发现同项标识 bool find = false; //确保不会溢出 while (xi < xlen && yi < ylen) { xbyte = xptr[xi]; ybyte = yptr[yi]; LABEL_NEQ: if (xbyte != ybyte) { //记录缓冲区数据 bxi = xbuffer[xbyte]; byi = ybuffer[ybyte]; //交叉索引缓冲区数据,看是否可以回溯 xbuf = xbuffer[ybyte]; ybuf = ybuffer[xbyte]; //检查两个缓冲区,如果都存在,回溯最小的对象 if (xbuf > xdiff) { if (ybuf > ydiff) { if (ybuf < xbuf) { goto LABLE_BACK_Y; } } goto LABLE_BACK_X; } if (ybuf > ydiff) { goto LABLE_BACK_Y; } goto LABLE_REC_BUF; LABLE_BACK_Y: //开始按XByte回溯Y yInBackTmp = yi; xInbackTmp = xi; yi = ybuffer[xbyte]; yInback = true; inBackByte = xbyte; //避免覆盖重复序列 if (bxi < xdiff) { xbuffer[xbyte] = xi; } if (byi < ydiff) { ybuffer[ybyte] = yi; } goto LABLE_LPEQ; LABLE_BACK_X: //开始按YByte回溯X xInbackTmp = xi; yInBackTmp = yi; xi = xbuffer[ybyte]; inBackByte = ybyte; xInback = true; //避免覆盖重复序列 if (bxi < xdiff) { xbuffer[xbyte] = xi; } if (byi < ydiff) { ybuffer[ybyte] = yi; } goto LABLE_LPEQ; LABLE_REC_BUF: //避免覆盖重复序列 if (bxi < xdiff) { xbuffer[xbyte] = xi; } if (byi < ydiff) { ybuffer[ybyte] = yi; } goto LABLE_LPED; } //当前字节码相等 LABLE_LPEQ: //往上回溯匹配看看是不是有漏网的 xtmp = xi - 1; ytmp = yi - 1; while (xtmp > -1 && xtmp > xdiff && ytmp > -1 && ytmp > ydiff) { if (xptr[xtmp] != yptr[ytmp]) { break; } xtmp--; ytmp--; } xi = xtmp + 1; yi = ytmp + 1; //往下匹配 xtmp = xi + 1; ytmp = yi + 1; while (xtmp < xlen && ytmp < ylen) { xbyte = xptr[xtmp]; ybyte = yptr[ytmp]; if (xbyte != ybyte) { break; } xtmp++; ytmp++; } elen = xtmp - xi; if (elen > minSegments) { xdiff = xtmp - 1; ydiff = ytmp - 1; //切分相同项 curr = new SameChunk(); curr.xStartIndex = xi; curr.xEndIndex = xdiff; curr.yStartIndex = yi; curr.yEndIndex = ydiff; if (prev != null) { prev.next = curr; curr.prev = prev; prev = curr; } else { prev = curr; if (root == null) { root = prev; } } //InternalAddChuncks(xdoc, xi, xtmp); xi = xtmp; yi = ytmp; xInback = false; yInback = false; goto LABEL_NEQ; } else { //说明太短,不视为是相同区间(重复区间) if (xInback) { //回溯失败,尝试向下找同项覆盖当前失败项 while (xi < xInbackTmp) { if (xptr[xi] == inBackByte) { xbuffer[inBackByte] = xi; find = true; break; } xi++; } if (!find) { xbuffer[inBackByte] = 0; } find = false; xi = xInbackTmp; yi = yInBackTmp; xInback = false; goto LABLE_LPED; } if (yInback) { while (yi < yInBackTmp) { if (yptr[yi] == inBackByte) { ybuffer[inBackByte] = yi; find = true; break; } yi++; } if (!find) { ybuffer[inBackByte] = 0; } find = false; yi = yInBackTmp; xi = xInbackTmp; yInback = false; goto LABLE_LPED; } xi = xtmp; yi = ytmp; continue; } LABLE_LPED: xi++; yi++; } DiffResult diff = new DiffResult(); diff.baseVersion = baseVersion; diff.newVersion = newVersion; diff.chunksRoot = root; return(diff); } } }