Hopp til innhald

Modul:scripts/findBestScript

Frå Wiktionary - den frie ordboka

Dokumentasjon for modulen kan opprettast på Modul:scripts/findBestScript/dok

-- Chooses, among the script objects in `scripts`, the one whose
-- `countCharacters` result for `text` is highest.
--
-- Parameters:
--   export      - table providing `getByCode(code)`; used to obtain the "None" script.
--   text        - the string to analyse.
--   lang        - accepted for interface compatibility; not read in this function.
--   scripts     - sequence of script objects exposing `_code` and `:countCharacters(text)`.
--   forceDetect - accepted for interface compatibility; not read in this function.
--
-- Returns a script object from `scripts`, or `export.getByCode("None")` when
-- the text has no countable characters or no script matches any of them.
return function (export, text, lang, scripts, forceDetect)
	-- Drop HTML entities of the form "&name;" before counting. According to
	-- the original note, the catfix function in [[Module:utilities]] adds
	-- tagging to a no-break space entity; its name consists of Latin letters,
	-- which made "Latn" win whenever it was one of the language's scripts.
	-- Entities containing "#" (numeric ones) are not matched by this pattern.
	local cleaned = string.gsub(text, "&[a-zA-Z0-9]+;", "")

	-- Number of characters left once whitespace and punctuation are removed.
	-- The second gsub counts UTF-8 sequences (one lead byte followed by any
	-- continuation bytes); the original author notes this is faster than
	-- mw.ustring.len.
	local stripped = mw.ustring.gsub(cleaned, "[%s%p]+", "")
	local _, total = string.gsub(stripped, "[\1-\127\194-\244][\128-\191]*", "")

	-- Nothing but spacing/punctuation: there is no script to detect.
	if total == 0 then
		return export.getByCode("None")
	end

	-- Walk the candidates in order, remembering the best match so far.
	local leader, leaderCount = nil, 0

	for _, candidate in ipairs(scripts) do
		local matched = candidate:countCharacters(cleaned)

		-- "Hani" (general Han) yields to "Hant" (traditional) or "Hans"
		-- (simplified) when one of those is already the best match, i.e. has
		-- matched at least one character. "Hani" matches shared characters as
		-- well as the exclusive ones, so it would otherwise outscore them in a
		-- string mixing both; this avoids having to list every shared
		-- character in both the traditional and simplified character lists.
		if candidate._code == "Hani" and matched >= 1 and leader then
			local leaderCode = leader._code
			if leaderCode == "Hant" or leaderCode == "Hans" then
				return leader
			end
		end

		if matched >= total then
			-- Every counted character belongs to this script; stop early.
			return candidate
		elseif matched > leaderCount then
			leader, leaderCount = candidate, matched
		end
	end

	-- No script matched even one character: fall back to "None".
	if not leader then
		return export.getByCode("None")
	end

	return leader
end