Interested Article - Diff

Документация

Тесты

✔ Все тесты пройдены.

Название Ожидается Фактически
✔ test_fastStrDiffRanges_diff_long_text
✔ test_fastStrDiffRanges_diff_no_timeout_4096
✔ test_fastStrDiffRanges_diff_no_timeout_512
✔ test_fastStrDiffRanges_diff_short_in_long
✔ test_fastStrDiffRanges_diff_short_in_long_2diffs
✔ test_fastStrDiffRanges_diff_short_text
✔ test_strDiffRanges_diff_end
✔ test_strDiffRanges_diff_middle
✔ test_strDiffRanges_diff_short_in_long
✔ test_strDiffRanges_diff_start
✔ test_strDiffRanges_empty_str
✔ test_strLongestCommonRange_added
✔ test_strLongestCommonRange_common_end
✔ test_strLongestCommonRange_common_start
✔ test_strLongestCommonRange_different
✔ test_strLongestCommonRange_empty
✔ test_strLongestCommonRange_equal
✔ test_strLongestCommonRange_longExample
✔ test_strLongestCommonRange_removed
✔ test_strLongestCommonRange_singleSymbol
local p = {}

-- TODO: Провести эксперимент с получением таблицы из символов из строки, проверить
-- будет, ли это быстрее, чем сравнение подстрок. Если представление ustring именно UTF-8,
-- то работа с таблицей может быть в теории быстрее (нет необходимости каждый раз обходить
-- всю строку для вычисления позиции символа).
-- Альтернативно подумать над алгоритмом побайтового сравнения, это должно быть быстрее.

-- TODO: Проверить по производительности в strLongestCommonRange() смену большей
-- и меньшей строк местами.

-- TODO: Использовать find с указанием индекса начала поиска в strLongestCommonRange.

function p.strLongestCommonRange(s1, s2, minWindowSize)
	-- TODO: Начало и длина вместо диапазонов
	if not minWindowSize then
		minWindowSize = 0
	end
	local longestStart1 = 0
	local longestEnd1 = 0
	local longestStart2 = 0
	local longestEnd2 = 0
	local currStart = 1
	local currEnd = 1 + minWindowSize
	
	local s1Len = mw.ustring.len(s1)
	local s2Len = mw.ustring.len(s2)

	while currEnd <= s1Len do
		local found = false
		local subS1
		local subS2
		local i = 1

		subS1 = mw.ustring.sub(s1, currStart, currEnd)
		while i + (currEnd - currStart) <= s2Len do
			if not found then
				subS2 = mw.ustring.sub(s2, i, i + (currEnd - currStart))
				if subS1 == subS2 then
					found = true
					currEnd = currEnd + 1
					if currEnd > s1Len then
						break
					end
				else
					i = i + 1
				end
			else
				local currEnd2 = i + currEnd - currStart
				if mw.ustring.sub(s1, currEnd, currEnd) == mw.ustring.sub(s2, currEnd2, currEnd2) then
					currEnd = currEnd + 1
					if currEnd > s1Len then
						break
					end
				else
					if currEnd - 1 - currStart > s1Len / 2 then
						return { currStart, currEnd - 1 }, { i, i + currEnd - 1 - currStart }
					end
					subS1 = mw.ustring.sub(s1, currStart, currEnd)
					if currEnd - 1 - currStart > longestEnd1 - longestStart1 or longestStart1 == 0 then
						longestStart1 = currStart
						longestEnd1 = currEnd - 1
						longestStart2 = i
						longestEnd2 = i + (longestEnd1 - longestStart1)
					end
					found = false
					i = i + 1
				end
			end
		end
		if found then
			if currEnd - 1 - currStart > longestEnd1 - longestStart1 or longestStart1 == 0 then
				longestStart1 = currStart
				longestEnd1 = currEnd - 1
				longestStart2 = i
				longestEnd2 = i + (longestEnd1 - longestStart1)
			end
		end
		currStart = currStart + 1
		currEnd = currEnd + 1
	end

	if longestStart1 == 0 then
		return nil, nil
	end
	return { longestStart1, longestEnd1 }, { longestStart2, longestEnd2 }
end

function p.strCommonRangesApprox(s1, s2, limit, s1Offset, s2Offset, ranges)
	ranges = ranges or {}
	s1Offset = s1Offset or 0
	s2Offset = s2Offset or 0

 	if s1 == '' or s2 == '' then
 		return ranges
 	end

	local d1, d2 = p.strLongestCommonRange(s1, s2, limit or 0)
	if d1 == nil then
		return ranges
	end
	if limit and d1[2] - d1[1] <= limit and d2[2] - d2[1] <= limit then
		return ranges
	end

	if d1[1] > 1 then
		p.strCommonRangesApprox(
			mw.ustring.sub(s1, 1, d1[1] - 1),
			mw.ustring.sub(s2, 1, d2[1] - 1),
			limit,
			s1Offset,
			s2Offset,
			ranges)
	end

	d1[1] = d1[1] + s1Offset
	d1[2] = d1[2] + s1Offset

	d2[1] = d2[1] + s2Offset
	d2[2] = d2[2] + s2Offset
	
	table.insert(ranges, { d1, d2 })

	if d1[2] - s1Offset < mw.ustring.len(s1) and d2[2] - s2Offset < mw.ustring.len(s2) then
		p.strCommonRangesApprox(
			mw.ustring.sub(s1, d1[2] + 1, mw.ustring.len(s1)),
			mw.ustring.sub(s2, d2[2] + 1, mw.ustring.len(s2)),
			limit,
			d1[2], d2[2],
			ranges)
	end

	return ranges
end

function p.strCommonRanges(s1, s2, s1Offset, s2Offset, ranges)
	return p.strCommonRangesApprox(s1, s2, nil, s1Offset, s2Offset, ranges)
end

local function tryGetWholeDiff(s1, s2)
	local s1Set = (s1 and s1 ~= '')
	local s2Set = (s2 and s2 ~= '')
	if s1Set and not s2Set then
		return {
			{
				{ 1, mw.ustring.len(s1) },
				{ 1, 0 },
			},
		}
	elseif not s1Set and s2Set then
		return {
			{
				{ 1, 0 },
				{ 1, mw.ustring.len(s2) },
			},
		}
	elseif (not s1Set and not s2Set) or (s1 == s2) then
		return {}
	end

	return nil
end

local function strDiffRangesNonNil(s1, s2, limit)
	local commonSubstrings = p.strCommonRangesApprox(s1, s2, limit)
	if table.getn(commonSubstrings) == 0 then
		return {
			{
				{ 1, mw.ustring.len(s1) },
				{ 1, mw.ustring.len(s2) },
			},
		}
	end

	local diffs = {}

	local curr = commonSubstrings[1]
	local last1 = 1
	local last2 = 1
	if curr[1][1] - last1 > 0 or curr[2][1] - last2 > 0 then
		table.insert(diffs, {
			{ last1, curr[1][1] - last1 },
			{ last2, curr[2][1] - last2 },
		})
	end
	last1 = curr[1][2]
	last2 = curr[2][2]

	for i = 2, table.getn(commonSubstrings) do
		curr = commonSubstrings[i]
		if curr[1][1] - last1 > 0 and curr[2][1] - last2 > 0 then
			if curr[1][1] - last1 > 0 then
				last1 = last1 + 1
			end
			if curr[2][1] - last2 > 0 then
				last2 = last2 + 1
			end
			table.insert(diffs, {
				{ last1, curr[1][1] - last1 },
				{ last2, curr[2][1] - last2 },
			})
		end
		last1 = curr[1][2]
		last2 = curr[2][2]
	end

	if mw.ustring.len(s1) - last1 > 0 or mw.ustring.len(s2) - last2 > 0 then
		table.insert(diffs, {
			{ last1 + 1, mw.ustring.len(s1) - last1 },
			{ last2 + 1, mw.ustring.len(s2) - last2 },
		})
	end

	return diffs
end

function p.strDiffRanges(s1, s2, limit)
	local diffs = tryGetWholeDiff(s1, s2)
	if diffs then
		return diffs
	else
		diffs = {}
	end

	return strDiffRangesNonNil(s1, s2, limit)
end

function p.findCommonFromStart(s1, s2)
	if mw.ustring.sub(s1, 1, 1) ~= mw.ustring.sub(s2, 1, 1) then
		return nil
	end

	local s1Len = mw.ustring.len(s1)
	local s2Len = mw.ustring.len(s2)
	local sLen
	if s1Len < s2Len then
		sLen = s1Len
	else
		sLen = s2Len
	end
	local i = 1
	local d = math.floor(sLen / 10)
	if d < 1 then
		d = 1
	end
	while i <= sLen do
		local s1Start = mw.ustring.sub(s1, i, i + d)
		local s2Start = mw.ustring.sub(s2, i, i + d)
		if s1Start ~= s2Start then
			for j = 1, d do
				if mw.ustring.sub(s1Start, j, j) ~= mw.ustring.sub(s2Start, j, j) then
					return i + j - 1 - 1
				end
			end
		end
		i = i + d
	end

	return sLen
end

function p.findCommonFromEnd(s1, s2)
	local s1Len = mw.ustring.len(s1)
	local s2Len = mw.ustring.len(s2)
	if mw.ustring.sub(s1, s1Len, s1Len) ~= mw.ustring.sub(s2, s2Len, s2Len) then
		return nil
	end

	local sLen
	if s1Len < s2Len then
		sLen = s1Len
	else
		sLen = s2Len
	end
	local i = 0
	local d = math.floor(sLen / 10)
	if d < 1 then
		d = 1
	end
	while i <= sLen do
		local s1Start = mw.ustring.sub(s1, s1Len - (i + d), s1Len - i)
		local s2Start = mw.ustring.sub(s2, s2Len - (i + d), s2Len - i)
		if s1Start ~= s2Start then
			for j = d, 1, -1 do
				if mw.ustring.sub(s1Start, j, j) ~= mw.ustring.sub(s2Start, j, j) then
					return i + d - j + 1
				end
			end
		end
		i = i + d
	end

	return sLen
end

-- Symbol from the string in Lua can be obtained only by the sub function,
-- which creates new string with global deduplication. Symbol by symbol
-- comparison is very slow. Diff algorithm is fast on small strings,
-- but on large strings it becomes extremely slowly in Lua.
-- On large strings, approximations are made to ignore minor similarities.
function p.fastStrDiffRanges(s1, s2, accuracyMaxLen)
	local diffs = tryGetWholeDiff(s1, s2)
	if diffs then
		return diffs
	end

	local limit
	local totalLen = s1:len() + s2:len()

	local startIndex
	
	if not accuracyMaxLen then
		accuracyMaxLen = 300
	end
	if totalLen > accuracyMaxLen then
		startIndex = p.findCommonFromStart(s1, s2)
		local endIndex = p.findCommonFromEnd(s1, s2)
		if startIndex then
			startIndex = startIndex + 1
		end
		if startIndex or endIndex then
			if not endIndex then
				endIndex = 0
			end
			s1 = mw.ustring.sub(s1, startIndex or 1, mw.ustring.len(s1) - endIndex)
			s2 = mw.ustring.sub(s2, startIndex or 1, mw.ustring.len(s2) - endIndex)
		end
		totalLen = s1:len() + s2:len()
		if totalLen > accuracyMaxLen then
			limit = totalLen - accuracyMaxLen
		end
	end

	diffs = p.strDiffRanges(s1, s2, limit)
	if startIndex then
		for _, diff in ipairs(diffs) do
			diff[1][1] = diff[1][1] + startIndex - 1
			diff[2][1] = diff[2][1] + startIndex - 1
		end
	end
	return diffs
end

return p
Источник —

Same as Diff