-- 模組:Citation/CS1/Language

--[[--------------------------< F O R W A R D   D E C L A R A T I O N S >--------------------------------------
]]
-- Helper functions taken from the utilities module; they stay nil until
-- set_selected_modules() assigns them, so that function must run first.
local is_set, in_array, wrap_msg, wrap_style; 
-- Category helpers taken from the error module; also assigned by set_selected_modules().
local add_prop_cat, add_maint_cat;

--[[--------------------------< L O C A L _  T A B L E >-------------------------------------------------------

]]

-- Locally maintained language names, keyed by lower-case language code.
-- language_parameter() consults this table (through fetchLocalLanguageName)
-- before asking MediaWiki, so an entry here overrides the MediaWiki name.
-- The commented-out zh-* entries are kept for reference only.
local local_table = {
	['abe']= '西阿贝纳基语',
	['abq']= '阿巴扎语',
	['abq-latn']= '阿巴扎语（拉丁文字）',
	['abs']= '安汶马来语',
	['ady-cyrl']= '阿迪格语（西里尔文）',
	['aeb-arab']= '突尼斯阿拉伯语（阿拉伯文字）',
	['aeb-latn']= '突尼斯阿拉伯语（拉丁文字）',
	['alc']= '阿拉卡卢夫语',
	['ami']= '阿美语',
	['azb']= '南阿塞拜疆语',
	['bat-smg']= '萨莫吉希亚语',
	['bbc-latn']= '多巴巴塔克语（拉丁文字）',
	['bcc']= '南俾路支语',
	['bcl']= '中比科尔语',
	['bdr']= '西海岸巴瑶语',
	['be-tarask']= '白俄罗斯语（传统正写法）',
	['be-x-old']= '白俄罗斯语（传统正写法）',
	['bgp']= '东俾路支语',
	['bh']='博杰普尔语',
	['bxr']='俄罗斯布里亚特语',
	['cja']='西部占语',
	['cja-arab']='西部占语（阿拉伯文字）',
	['cja-cham']='西部占语（占文）',											-- 'cham' is the Cham script, not Arabic
	['cja-latn']='西部占语（拉丁文字）',
	['cjm']='东部占语',
	['cjm-arab']='东部占语（阿拉伯文字）',
	['cjm-cham']='东部占语（占文）',											-- 'cham' is the Cham script, not Arabic
	['cjm-latn']='东部占语（拉丁文字）',
	['cjy']='晋语',
	['cjy-hans']='晋语（简体）',
	['cjy-hant']='晋语（繁体）',
	['ckt']='楚科奇语',
	['en-in']= '印度英语',
	['ike-cans']='东加拿大语（原住民音节）',
	['ike-latn']='东加拿大语（拉丁文字）',
	['ruq']='梅格莱诺-罗马尼亚语',												-- NOTE(review): spelled 梅格列诺 in the three script variants below; confirm which is intended
	['ruq-cyrl']='梅格列诺-罗马尼亚语（西里尔文字）',
	['ruq-grek']='梅格列诺-罗马尼亚语（希腊文字）',
	['ruq-latn']='梅格列诺-罗马尼亚语（拉丁文字）',
	['cdo']= '闽东语',
	['cdo-hani']= '闽东语（汉字）',
	['ja-hani'] = '日语（汉字文字）',
	['ja-hira'] = '日语（平假名文字）',
	['ja-hrkt'] = '日语（假名文字）',
	['ja-kana'] = '日语（片假名文字）',
	['ko-kp'] = '朝鲜朝鲜语',
	['kk-cn'] = '中国哈萨克语',
	['no'] = '挪威语',
	['ojp'] = '古日语',
	['ojp-hani'] = '古日语（汉字文字）',
	['ojp-hira'] = '古日语（平假名文字）',
	['tet'] = '德顿语',
	['ug-arab'] = '维吾尔语（阿拉伯文字）',
	['ug-latn'] = '维吾尔语（拉丁文字）',
--	['zh-cn'] = '中国大陆中文',
--	['zh-hans'] = '简体中文',
--	['zh-hant'] = '繁體中文',
--	['zh-hk'] = '香港中文',
	['zh-min-nan'] = '闽南语',
--	['zh-mo'] = '澳門中文',
--	['zh-my'] = '马来西亚中文',
--	['zh-sg'] = '新加坡中文',
--	['zh-tw'] = '臺灣中文',
}

-- Look up a language code in the locally maintained name table.
-- Returns the local name, or nil when the code has no local entry.
-- The lookup is an exact key match; callers lower-case the code first.
local function fetchLocalLanguageName (code)
	local local_name = local_table[code];
	return local_name;
end

--[[--------------------------< F O R M A T _ S C R I P T _ V A L U E >----------------------------------------

|script-title= holds title parameters that are not written in Latin based scripts: Chinese, Japanese, Arabic, Hebrew, etc. These scripts should
not be italicized and may be written right-to-left.  The value supplied by |script-title= is concatenated onto Title after Title has been wrapped
in italic markup.

Regardless of language, all values provided by |script-title= are wrapped in <bdi>...</bdi> tags to isolate rtl languages from the English left to right.

|script-title= provides a unique feature.  The value in |script-title= may be prefixed with a two-character ISO639-1 language code and a colon:
	|script-title=ja:*** *** (where * represents a Japanese character)
Spaces between the two-character code and the colon and the colon and the first script character are allowed:
	|script-title=ja : *** ***
	|script-title=ja: *** ***
	|script-title=ja :*** ***
Spaces preceding the prefix are allowed: |script-title = ja:*** ***

The prefix is checked for validity.  If it is a valid ISO639-1 language code, the lang attribute (lang="ja") is added to the <bdi> tag so that browsers can
know the language the tag contains.  This may help the browser render the script more correctly.  If the prefix is invalid, the lang attribute
is not added.  At this time there is no error message for this condition.

Supports |script-title= and |script-chapter=

TODO: error messages when prefix is invalid ISO639-1 code; when script_value has prefix but no script;
]]

-- Wrap a |script-<param>= value in <bdi> markup.
--
-- script_value: the raw parameter value, optionally starting with a
--   two-lower-case-letter language prefix and a colon ('ja: ...').
-- Returns the wrapped value, or '' when the value is a prefix with no script
-- text after it.  When the prefix names a language known to MediaWiki the
-- prefix is stripped, a lang attribute is added and a properties category is
-- recorded; an unknown prefix is left in place and no attribute is added.
local function format_script_value (script_value)
	local lang_attribute = '';													-- stays empty unless a valid prefix is found

	if script_value:match ('^%l%l%s*:') then									-- value starts with something that looks like a language prefix
		local prefix = script_value:match ('^(%l%l)%s*:%s*%S.*');				-- nil when nothing but white space follows the colon
		if not is_set (prefix) then
			return '';															-- prefix without script text
		end

		local language_name = mw.language.fetchLanguageName (prefix, mw.getContentLanguage():getCode());
		if is_set (language_name) then											-- MediaWiki knows this code
			local named_script_codes = {'ar', 'bg', 'bs', 'dv', 'el', 'fa', 'he', 'hy', 'ja', 'ka', 'ko', 'ku', 'mk', 'ps', 'ru', 'sd', 'sr', 'th', 'uk', 'ug', 'yi', 'zh'};

			script_value = (script_value:gsub ('^%l%l%s*:%s*', ''));			-- drop the prefix; parentheses discard gsub's match count

			if in_array (prefix, named_script_codes) then
				add_prop_cat ('script_with_name', {language_name, prefix})		-- codes in the list get a per-language category
			else
				add_prop_cat ('script')											-- every other valid code shares one category
			end

			lang_attribute = ' lang="' .. prefix .. '" ';
		end
	end

	if is_set (script_value) then
		script_value = '-{R|' .. script_value .. '}-';							-- -{R|...}- markup: keep the language converter from altering the text
	end

	local wrapped = wrap_style ('bdi', {lang_attribute, script_value});		-- isolate in case the script is right-to-left
	return wrapped;
end

--[[--------------------------< S C R I P T _ C O N C A T E N A T E >------------------------------------------

Initially for |title= and |script-title=, this function concatenates those two parameter values after the script value has been 
wrapped in <bdi> tags.
]]

-- Join a title with its script counterpart.
--
-- title:  value of the title-like parameter; wrapped in -{R|...}- when set.
-- script: value of the matching |script-...= parameter, may be unset.
-- Returns the (possibly wrapped) title, followed by a space and the formatted
-- script value when format_script_value() yields something.
local function script_concatenate (title, script)
	if is_set (title) then
		title = '-{R|' .. title .. '}-';										-- keep the language converter from altering the title
	end

	if not is_set (script) then
		return title;															-- nothing to append
	end

	local formatted_script = format_script_value (script);						-- <bdi> tags, lang attribute, categorization, etc; empty string on error
	if not is_set (formatted_script) then
		return title;
	end

	return title .. ' ' .. formatted_script;
end

--[[--------------------------< G E T _ I S O 6 3 9 _ C O D E >------------------------------------------------

Validates language names provided in |language= parameter if not an ISO639-1 code.  Handles the special case that is Norwegian where
ISO639-1 code 'no' is mapped to language name 'Norwegian Bokmål' by Extension:CLDR.

Returns the language name and associated ISO639-1 code.  Because case of the source may be incorrect or different from the case that Wikimedia
uses, the name comparisons are done in lower case and when a match is found, the Wikimedia version (assumed to be correct) is returned along
with the code.  When there is no match, we return the original language name string.

mw.language.fetchLanguageNames() will return a list of languages that aren't part of ISO639-1. Names that aren't ISO639-1 but that are included
in the list will be found if that name is provided in the |language= parameter.  For example, if |language=Samaritan Aramaic, that name will be
found with the associated code 'sam', not an ISO639-1 code.  When names are found and the associated code is not two characters, this function
returns only the Wikimedia language name.

Adapted from code taken from Module:Check ISO 639-1.

]]

-- Resolve a language name to MediaWiki's spelling of it and, when one exists,
-- its two-character code.
--
-- lang: language name as written in |language= (any letter case).
-- Returns:
--   name, code  when the name belongs to a two-character code;
--   name        when the name is known only under longer codes;
--   lang        (unchanged) when the name is not known at all.
--
-- Several codes can carry the same name and pairs() visits them in no
-- particular order, so every match is examined: a two-character code always
-- wins over a longer one and ties are broken by the smaller code, which makes
-- the result independent of iteration order.
local function get_iso639_code (lang)
	if 'norwegian' == lang:lower() then											-- special case related to Wikimedia remap of code 'no' at Extension:CLDR
		return '挪威语', 'no';
	end

	local languages = mw.language.fetchLanguageNames (mw.getContentLanguage():getCode(), 'all');
																				-- ('all' is required for North Ndebele, South Ndebele, and Ojibwa)
	local langlc = mw.ustring.lower (lang);										-- lower case version for comparisons
	local best_code, best_name;													-- best match seen so far

	for code, name in pairs (languages) do
		if langlc == mw.ustring.lower (name) then
			local is_short = 2 == #code;
			local best_is_short = best_code ~= nil and 2 == #best_code;
			if nil == best_code
				or (is_short and not best_is_short)								-- two-character code beats a longer one
				or (is_short == best_is_short and code < best_code) then		-- same kind: the smaller code wins
				best_code, best_name = code, name;
			end
		end
	end

	if nil == best_code then
		return lang;															-- not a known language; original case, no code
	end
	if 2 ~= #best_code then
		return best_name;														-- known, but not under a two-character code: name only
	end
	return best_name, best_code;												-- MediaWiki's spelling of the name plus the code
end

--[[--------------------------< L A N G U A G E _ P A R A M E T E R >------------------------------------------

Get language name from ISO639-1 code value provided.  If a code is valid use the returned name; if not, then use the value that was provided with the language parameter.

There is an exception.  There are three ISO639-1 codes for Norwegian language variants.  There are two official variants: Norwegian Bokmål (code 'nb') and
Norwegian Nynorsk (code 'nn').  The third, code 'no',  is defined by ISO639-1 as 'Norwegian' though in Norway this is pretty much meaningless.  However, it appears
that on enwiki, editors are for the most part unaware of the nb and nn variants (compare page counts for these variants at Category:Articles with non-English-language external links).

Because Norwegian Bokmål is the most common language variant, MediaWiki has been modified to return Norwegian Bokmål for ISO639-1 code 'no'. Here we undo that and
return 'Norwegian' when editors use |language=no.  We presume that editors don't know about the variants or can't discriminate between them.

See Help talk:Citation Style_1#An ISO 639-1 language name test

When |language= contains a valid ISO639-1 code, the page is assigned to the category for that code: Category:Norwegian-language sources (no) if
the page is a mainspace page and the ISO639-1 code is not 'en'.  Similarly, if the  parameter is |language=Norwegian, it will be categorized in the same way.

This function supports multiple languages in the form |language=nb, French, th where the language names or codes are separated from each other by commas.

]]

-- Render the value of |language= and categorize the page by language.
--
-- lang: comma-separated list of language codes and/or language names.
-- Returns '' when lang is not set; otherwise a space followed by the list
-- passed through wrap_msg ('language', ...).  Two names are joined with '及';
-- three or more are joined with '、' and a final '及'.
--
-- Each item is resolved in this order: the local name table, then MediaWiki
-- (only for two-character codes and codes of the form 'xx-...'), then a
-- lookup by name through get_iso639_code().  Items that yield a code other
-- than 'zh' / 'zh-...' add the 'foreign_lang_source' properties category;
-- items that yield no code add the 'unknown_lang' maintenance category.
local function language_parameter (lang)
	if not is_set (lang) then
		return '';
	end

	local content_lang_code = mw.getContentLanguage():getCode();				-- loop invariant, so fetch once
	local language_list = {};													-- language names to be rendered

	for _, item in ipairs (mw.text.split (lang, '%s*,%s*')) do					-- names should be a comma separated list
		local item_lc = item:lower();
		local code;																-- code for this item; nil when unknown
		local name = fetchLocalLanguageName (item_lc);							-- local table first

		if not is_set (name) and (item_lc:match ('^%a%a%-') or 2 == item_lc:len()) then
			-- The hyphen must be escaped: in a Lua pattern 'h-' means "zero or
			-- more h, lazily", so an unescaped '^zh-' matches every string that
			-- begins with 'z' (zu, za, ...).
			if item_lc:match ('^zh%-') then
				name = mw.language.fetchLanguageName (item_lc, item_lc);		-- Chinese variants are named in their own variant
			else
				name = mw.language.fetchLanguageName (item_lc, content_lang_code);
			end
		end

		if is_set (name) then													-- item was a recognised code
			code = item_lc;
		else
			name, code = get_iso639_code (item);								-- treat item as a name; also normalises its capitalization
		end

		if is_set (code) then
			if 'zh' ~= code and not code:match ('^zh%-') then					-- Chinese is not a foreign language on this wiki
				add_prop_cat ('foreign_lang_source', {name, code})
			end
		else
			add_maint_cat ('unknown_lang');										-- add maint category if not already added
		end

		language_list[#language_list + 1] = name;
	end

	local count = #language_list;
	local rendered;
	if 2 >= count then
		rendered = table.concat (language_list, '及');							-- zero, one or two names
	else
		rendered = table.concat (language_list, '、', 1, count - 1)				-- all but the last separated by '、'
			.. '及' .. language_list[count];									-- and the last one introduced by '及'
	end

	return " " .. wrap_msg ('language', rendered);
end
--[[--------------------------< S E T _ S E L E C T E D _ M O D U L E S >--------------------------------------
]]

-- Bind the helper functions this module relies on.
--
-- utilities_page_ptr: table supplying is_set, in_array, wrap_style and wrap_msg.
-- error_page_ptr:     table supplying add_prop_cat and add_maint_cat.
-- Must run before any other function of this module is used, because the
-- helpers are nil until they are assigned here.
local function set_selected_modules (utilities_page_ptr, error_page_ptr)
	-- functions from the selected utilities module
	is_set, in_array, wrap_style, wrap_msg =
		utilities_page_ptr.is_set,
		utilities_page_ptr.in_array,
		utilities_page_ptr.wrap_style,
		utilities_page_ptr.wrap_msg;

	-- functions from the selected error module
	add_prop_cat, add_maint_cat =
		error_page_ptr.add_prop_cat,
		error_page_ptr.add_maint_cat;
end


--[[--------------------------< E X P O R T E D   F U N C T I O N S >------------------------------------------
]]

-- Public interface of this module.  Everything else in the file is local.
return {
	script_concatenate = script_concatenate,									-- joins a title with its formatted script value
	language_parameter =language_parameter,										-- renders and categorizes the |language= value
	
	set_selected_modules = set_selected_modules									-- assigns the helper functions used by the two functions above; call it first
	}