Think Different
- 1 year ago
- 0
- 0
Все тесты пройдены.
Название | Ожидается | Фактически | |
---|---|---|---|
test_fastStrDiffRanges_diff_long_text | |||
test_fastStrDiffRanges_diff_no_timeout_4096 | |||
test_fastStrDiffRanges_diff_no_timeout_512 | |||
test_fastStrDiffRanges_diff_short_in_long | |||
test_fastStrDiffRanges_diff_short_in_long_2diffs | |||
test_fastStrDiffRanges_diff_short_text | |||
test_strDiffRanges_diff_end | |||
test_strDiffRanges_diff_middle | |||
test_strDiffRanges_diff_short_in_long | |||
test_strDiffRanges_diff_start | |||
test_strDiffRanges_empty_str | |||
test_strLongestCommonRange_added | |||
test_strLongestCommonRange_common_end | |||
test_strLongestCommonRange_common_start | |||
test_strLongestCommonRange_different | |||
test_strLongestCommonRange_empty | |||
test_strLongestCommonRange_equal | |||
test_strLongestCommonRange_longExample | |||
test_strLongestCommonRange_removed | |||
test_strLongestCommonRange_singleSymbol |
local p = {}
-- TODO: Провести эксперимент с получением таблицы из символов из строки, проверить
-- будет, ли это быстрее, чем сравнение подстрок. Если представление ustring именно UTF-8,
-- то работа с таблицей может быть в теории быстрее (нет необходимости каждый раз обходить
-- всю строку для вычисления позиции символа).
-- Альтернативно подумать над алгоритмом побайтового сравнения, это должно быть быстрее.
-- TODO: Проверить по производительности в strLongestCommonRange() смену большей
-- и меньшей строк местами.
-- TODO: Использовать find с указанием индекса начала поиска в strLongestCommonRange.
function p.strLongestCommonRange(s1, s2, minWindowSize)
-- TODO: Начало и длина вместо диапазонов
if not minWindowSize then
minWindowSize = 0
end
local longestStart1 = 0
local longestEnd1 = 0
local longestStart2 = 0
local longestEnd2 = 0
local currStart = 1
local currEnd = 1 + minWindowSize
local s1Len = mw.ustring.len(s1)
local s2Len = mw.ustring.len(s2)
while currEnd <= s1Len do
local found = false
local subS1
local subS2
local i = 1
subS1 = mw.ustring.sub(s1, currStart, currEnd)
while i + (currEnd - currStart) <= s2Len do
if not found then
subS2 = mw.ustring.sub(s2, i, i + (currEnd - currStart))
if subS1 == subS2 then
found = true
currEnd = currEnd + 1
if currEnd > s1Len then
break
end
else
i = i + 1
end
else
local currEnd2 = i + currEnd - currStart
if mw.ustring.sub(s1, currEnd, currEnd) == mw.ustring.sub(s2, currEnd2, currEnd2) then
currEnd = currEnd + 1
if currEnd > s1Len then
break
end
else
if currEnd - 1 - currStart > s1Len / 2 then
return { currStart, currEnd - 1 }, { i, i + currEnd - 1 - currStart }
end
subS1 = mw.ustring.sub(s1, currStart, currEnd)
if currEnd - 1 - currStart > longestEnd1 - longestStart1 or longestStart1 == 0 then
longestStart1 = currStart
longestEnd1 = currEnd - 1
longestStart2 = i
longestEnd2 = i + (longestEnd1 - longestStart1)
end
found = false
i = i + 1
end
end
end
if found then
if currEnd - 1 - currStart > longestEnd1 - longestStart1 or longestStart1 == 0 then
longestStart1 = currStart
longestEnd1 = currEnd - 1
longestStart2 = i
longestEnd2 = i + (longestEnd1 - longestStart1)
end
end
currStart = currStart + 1
currEnd = currEnd + 1
end
if longestStart1 == 0 then
return nil, nil
end
return { longestStart1, longestEnd1 }, { longestStart2, longestEnd2 }
end
function p.strCommonRangesApprox(s1, s2, limit, s1Offset, s2Offset, ranges)
ranges = ranges or {}
s1Offset = s1Offset or 0
s2Offset = s2Offset or 0
if s1 == '' or s2 == '' then
return ranges
end
local d1, d2 = p.strLongestCommonRange(s1, s2, limit or 0)
if d1 == nil then
return ranges
end
if limit and d1[2] - d1[1] <= limit and d2[2] - d2[1] <= limit then
return ranges
end
if d1[1] > 1 then
p.strCommonRangesApprox(
mw.ustring.sub(s1, 1, d1[1] - 1),
mw.ustring.sub(s2, 1, d2[1] - 1),
limit,
s1Offset,
s2Offset,
ranges)
end
d1[1] = d1[1] + s1Offset
d1[2] = d1[2] + s1Offset
d2[1] = d2[1] + s2Offset
d2[2] = d2[2] + s2Offset
table.insert(ranges, { d1, d2 })
if d1[2] - s1Offset < mw.ustring.len(s1) and d2[2] - s2Offset < mw.ustring.len(s2) then
p.strCommonRangesApprox(
mw.ustring.sub(s1, d1[2] + 1, mw.ustring.len(s1)),
mw.ustring.sub(s2, d2[2] + 1, mw.ustring.len(s2)),
limit,
d1[2], d2[2],
ranges)
end
return ranges
end
function p.strCommonRanges(s1, s2, s1Offset, s2Offset, ranges)
return p.strCommonRangesApprox(s1, s2, nil, s1Offset, s2Offset, ranges)
end
local function tryGetWholeDiff(s1, s2)
local s1Set = (s1 and s1 ~= '')
local s2Set = (s2 and s2 ~= '')
if s1Set and not s2Set then
return {
{
{ 1, mw.ustring.len(s1) },
{ 1, 0 },
},
}
elseif not s1Set and s2Set then
return {
{
{ 1, 0 },
{ 1, mw.ustring.len(s2) },
},
}
elseif (not s1Set and not s2Set) or (s1 == s2) then
return {}
end
return nil
end
local function strDiffRangesNonNil(s1, s2, limit)
local commonSubstrings = p.strCommonRangesApprox(s1, s2, limit)
if table.getn(commonSubstrings) == 0 then
return {
{
{ 1, mw.ustring.len(s1) },
{ 1, mw.ustring.len(s2) },
},
}
end
local diffs = {}
local curr = commonSubstrings[1]
local last1 = 1
local last2 = 1
if curr[1][1] - last1 > 0 or curr[2][1] - last2 > 0 then
table.insert(diffs, {
{ last1, curr[1][1] - last1 },
{ last2, curr[2][1] - last2 },
})
end
last1 = curr[1][2]
last2 = curr[2][2]
for i = 2, table.getn(commonSubstrings) do
curr = commonSubstrings[i]
if curr[1][1] - last1 > 0 and curr[2][1] - last2 > 0 then
if curr[1][1] - last1 > 0 then
last1 = last1 + 1
end
if curr[2][1] - last2 > 0 then
last2 = last2 + 1
end
table.insert(diffs, {
{ last1, curr[1][1] - last1 },
{ last2, curr[2][1] - last2 },
})
end
last1 = curr[1][2]
last2 = curr[2][2]
end
if mw.ustring.len(s1) - last1 > 0 or mw.ustring.len(s2) - last2 > 0 then
table.insert(diffs, {
{ last1 + 1, mw.ustring.len(s1) - last1 },
{ last2 + 1, mw.ustring.len(s2) - last2 },
})
end
return diffs
end
function p.strDiffRanges(s1, s2, limit)
local diffs = tryGetWholeDiff(s1, s2)
if diffs then
return diffs
else
diffs = {}
end
return strDiffRangesNonNil(s1, s2, limit)
end
function p.findCommonFromStart(s1, s2)
if mw.ustring.sub(s1, 1, 1) ~= mw.ustring.sub(s2, 1, 1) then
return nil
end
local s1Len = mw.ustring.len(s1)
local s2Len = mw.ustring.len(s2)
local sLen
if s1Len < s2Len then
sLen = s1Len
else
sLen = s2Len
end
local i = 1
local d = math.floor(sLen / 10)
if d < 1 then
d = 1
end
while i <= sLen do
local s1Start = mw.ustring.sub(s1, i, i + d)
local s2Start = mw.ustring.sub(s2, i, i + d)
if s1Start ~= s2Start then
for j = 1, d do
if mw.ustring.sub(s1Start, j, j) ~= mw.ustring.sub(s2Start, j, j) then
return i + j - 1 - 1
end
end
end
i = i + d
end
return sLen
end
function p.findCommonFromEnd(s1, s2)
local s1Len = mw.ustring.len(s1)
local s2Len = mw.ustring.len(s2)
if mw.ustring.sub(s1, s1Len, s1Len) ~= mw.ustring.sub(s2, s2Len, s2Len) then
return nil
end
local sLen
if s1Len < s2Len then
sLen = s1Len
else
sLen = s2Len
end
local i = 0
local d = math.floor(sLen / 10)
if d < 1 then
d = 1
end
while i <= sLen do
local s1Start = mw.ustring.sub(s1, s1Len - (i + d), s1Len - i)
local s2Start = mw.ustring.sub(s2, s2Len - (i + d), s2Len - i)
if s1Start ~= s2Start then
for j = d, 1, -1 do
if mw.ustring.sub(s1Start, j, j) ~= mw.ustring.sub(s2Start, j, j) then
return i + d - j + 1
end
end
end
i = i + d
end
return sLen
end
-- Symbol from the string in Lua can be obtained only by the sub function,
-- which creates new string with global deduplication. Symbol by symbol
-- comparison is very slow. Diff algorithm is fast on small strings,
-- but on large strings it becomes extremely slowly in Lua.
-- On large strings, approximations are made to ignore minor similarities.
function p.fastStrDiffRanges(s1, s2, accuracyMaxLen)
local diffs = tryGetWholeDiff(s1, s2)
if diffs then
return diffs
end
local limit
local totalLen = s1:len() + s2:len()
local startIndex
if not accuracyMaxLen then
accuracyMaxLen = 300
end
if totalLen > accuracyMaxLen then
startIndex = p.findCommonFromStart(s1, s2)
local endIndex = p.findCommonFromEnd(s1, s2)
if startIndex then
startIndex = startIndex + 1
end
if startIndex or endIndex then
if not endIndex then
endIndex = 0
end
s1 = mw.ustring.sub(s1, startIndex or 1, mw.ustring.len(s1) - endIndex)
s2 = mw.ustring.sub(s2, startIndex or 1, mw.ustring.len(s2) - endIndex)
end
totalLen = s1:len() + s2:len()
if totalLen > accuracyMaxLen then
limit = totalLen - accuracyMaxLen
end
end
diffs = p.strDiffRanges(s1, s2, limit)
if startIndex then
for _, diff in ipairs(diffs) do
diff[1][1] = diff[1][1] + startIndex - 1
diff[2][1] = diff[2][1] + startIndex - 1
end
end
return diffs
end
return p