Jump to content

Module:Jpan-headword

From Wiktionary, the free dictionary

This implements Japanese headword-line templates and all of the associated templates that they call to do categorization and error checking.


local m_ja = require("Module:ja")
local m_ja_ruby = require("Module:ja-ruby")
local m_str_utils = require("Module:string utilities")

local byteoffset = mw.ustring.byteoffset
local concat = table.concat
local insert = table.insert
local kana_to_romaji = require("Module:Hrkt-translit").tr
local maxn = table.maxn or require("Module:table").maxIndex  -- maxn is deprecated; maxIndex is not strictly equivalent, but equivalent enough here
local moraify = m_ja.moraify
local remove = table.remove
local ugmatch = mw.ustring.gmatch
local ugsub = m_str_utils.gsub
local ulen = m_str_utils.len
local ulower = m_str_utils.lower
local umatch = mw.ustring.match
local usub = m_str_utils.sub
local gsplit = m_str_utils.gsplit

local export = {}
local pos_functions = {}

local range = mw.loadData('Module:ja/data/range')
local Jpan = require("Module:scripts").getByCode("Jpan")

-- Strip wikilink markup from `text`, keeping only the displayed part:
-- "[[target|shown]]" becomes "shown" and "[[page]]" becomes "page".
-- Always returns exactly one value (the gsub counts are discarded).
local function remove_links(text)
	-- Drop the "[[target|" prefix of piped links first ...
	local stripped = text:gsub("%[%[[^|%]]-|", "")
	-- ... then any remaining opening and closing brackets.
	stripped = stripped:gsub("%[%[", "")
	stripped = stripped:gsub("%]%]", "")
	return stripped
end

-- Try to split `head` and `kana` into aligned segments, using the readings
-- given in the kanjitab template(s) found in the wikitext of `pagename`.
--
-- head:          headword text, possibly containing [[...]] wikilinks
-- kana:          kana reading of the whole headword
-- pagename:      title of the page whose content is scanned for kanjitabs
-- template_name: Lua-pattern fragment for the template name (the caller in
--                this file passes lang_code .. '%-kanjitab'); '%' is removed
--                again when the name is quoted in error messages
--
-- Returns two strings. When a kanjitab produces a decomposition whose kana
-- matches `kana`, the results are the head segments and the kana segments,
-- each joined with '%'. Otherwise `head` and `kana` are returned unchanged.
-- Errors if a kanjitab's brackets cannot be matched, or if the head and kana
-- decompositions end up with different numbers of segments.
local function assign_kana_to_kanji(head, kana, pagename, template_name)
	local m_tu = require'Module:template utilities'

	-- kanji_pos[i] = { first byte, last byte } of the i-th kanji inside `head`
	-- (link markup included). Index 0 is a sentinel whose end position is 0,
	-- so "text before the first kanji" starts at byte 1.
	local kanji_pos = {[0] = { nil, 0}}
	-- Pieces of `head` with the link markup removed; concatenated below.
	local head_nolink = {}
	-- Byte offset in `head` at which the substring currently being scanned starts.
	local link_border = 0
	-- Record `substr` as visible text and note the byte range of every kanji
	-- (or 々) it contains, translated into positions within `head`.
	local function insert_kanji_pos(substr)
		insert(head_nolink, substr)
		for p1, w1 in ugmatch(substr, '()([々' .. range.kanji .. '])') do
			-- p1 comes from a ustring match; byteoffset presumably converts that
			-- character position to a byte offset within substr -- TODO confirm
			p1 = byteoffset(substr, p1) + link_border
			insert(kanji_pos, { p1, p1 + w1:len() - 1 })
		end
	end
	-- Walk over every [[...]] link; p1/p2 are used as the start/end positions of
	-- the link in `head` and w1 as its text (presumably including the brackets,
	-- given the -3 below -- verify against Module:template utilities).
	for p1, p2, w1 in m_tu.gfind_bracket(head, {['%[%['] = ']]'}) do
		-- Plain text between the previous link and this one.
		insert_kanji_pos(head:sub(link_border + 1, p1 - 1))
		
		-- Displayed text starts after the pipe, or after "[[" when there is none.
		local p_pipe = w1:find'|' or 2
		link_border = p1 + p_pipe - 1
		insert_kanji_pos(w1:sub(p_pipe + 1, -3))
		
		link_border = p2
	end
	-- Trailing text after the last link (or the whole head if there were none).
	insert_kanji_pos(head:sub(link_border + 1))
	head_nolink = concat(head_nolink)

	-- Without page content there is no kanjitab to consult; give up quietly.
	local pagetext = mw.title.new(pagename):getContent()
	if not pagetext then return head, kana end

	-- non_kanji[i] is the link-free text preceding the i-th kanji; the final
	-- element is the text after the last kanji. Positions here are the ones
	-- returned by the ustring matcher and consumed by usub.
	local non_kanji = {}
	local last_kanji = 1
	for p1 in ugmatch(head_nolink, '[々' .. range.kanji .. ']()') do
		-- p1 is the position just past the kanji, so p1 - 2 is the position
		-- immediately before it.
		insert(non_kanji, usub(head_nolink, last_kanji, p1 - 2))
		last_kanji = p1
	end
	insert(non_kanji, usub(head_nolink, last_kanji))

	-- Try each kanjitab invocation on the page until one fits.
	for kanjitab in pagetext:gmatch('(){{%s*' .. template_name) do
		-- Replace the start position with the full template text, then parse it.
		kanjitab = select(3, m_tu.find_bracket(pagetext, m_tu.brackets_temp, kanjitab))
		if not kanjitab then error('ill-formed [[t:' .. template_name:gsub('%%', '') .. ']] syntax') end
		kanjitab = m_tu.parse_temp(kanjitab)
		
		-- readings[i]:     kana for the i-th reading (k<i> overrides the positional
		--                  value), followed by the o<i> value if present
		-- readings_len[i]: number taken from the trailing digits of positional
		--                  argument i (default 1); used below as the number of
		--                  kanji the reading covers
		local readings = {}
		local readings_len = {}
		
		for i = 1, maxn(kanjitab.args) do
			local r_i = kanjitab.args[i] or ''
			local r_o = kanjitab.args['o' .. i] or ''
			if kanjitab.args['k' .. i] then
				readings[i] = kanjitab.args['k' .. i] .. r_o
				readings_len[i] = tonumber(r_i:match'^%s*%D*(%d*)%s*$') or 1
			else
				local r_kana, r_len = r_i:match'^%s*(%D*)(%d*)%s*$'
				readings[i] = r_kana .. r_o
				readings_len[i] = tonumber(r_len) or 1
			end
		end

		-- kana_decom alternates: text before a kanji group, reading of that group,
		-- ..., text after the last kanji.
		local kana_decom = {}
		local reading_id = 1
		-- Counts down the kanji still covered by the current multi-kanji reading.
		local reading_len = 1
		for i = 1, #non_kanji - 1 do
			if reading_len <= 1 then
				reading_len = readings_len[reading_id] or 1

				insert(kana_decom, non_kanji[i])
				insert(kana_decom, readings[reading_id])

				reading_id = reading_id + 1
			else
				reading_len = reading_len - 1
			end
		end
		insert(kana_decom, non_kanji[#non_kanji])
		
		-- Replace every run of non-kana characters with `repl`; the `or nil`
		-- truncates ugsub's results to a single value.
		local function strip_nonkana(str, repl)
			return ugsub(str, '[^' .. range.kana .. ']+', repl) or nil
		end
		-- Match the supplied kana against the decomposition, with each non-kana run
		-- in the decomposition turned into a lazy capture. The captures are the
		-- kana that stand in for those non-kana runs.
		local xeno_reading = {strip_nonkana(kana, ''):match('^' .. strip_nonkana(concat(kana_decom), '(.-)') .. '$')}
		if #xeno_reading > 0 then
			-- Build the matching decomposition of `head` from the recorded byte
			-- positions, grouping kanji exactly as kana_decom does.
			local head_decom = {}
			reading_id = 1
			reading_len = 1
			for i = 1, #non_kanji - 1 do
				if reading_len <= 1 then
					reading_len = readings_len[reading_id] or 1

					-- Text between the previous kanji and this one, then the kanji
					-- group covered by this reading.
					insert(head_decom, head:sub(kanji_pos[i - 1][2] + 1, kanji_pos[i][1] - 1))
					insert(head_decom, head:sub(kanji_pos[i][1], kanji_pos[i + reading_len - 1][2]))

					reading_id = reading_id + 1
				else
					reading_len = reading_len - 1
				end
			end
			insert(head_decom, head:sub(kanji_pos[#non_kanji - 1][2] + 1))
			
			if #head_decom ~= #kana_decom then error('number of parameters in [[t:' .. template_name:gsub('%%', '') .. ']] is incorrect') end
			
			-- In the non-kanji slots (odd indices), substitute each non-kana run
			-- with the kana captured for it; an empty capture leaves the run as is
			-- (a nil replacement keeps the original match).
			local n_xeno_reading = 0
			for i = 1, #kana_decom, 2 do
				kana_decom[i] = ugsub(kana_decom[i], '[^' .. range.kana .. ']+', function()
					n_xeno_reading = n_xeno_reading + 1
					if xeno_reading[n_xeno_reading] == '' then return nil
					else return xeno_reading[n_xeno_reading] end
				end)
			end
			
			return concat(head_decom, '%'), concat(kana_decom, '%')
		end
	end

	-- No kanjitab produced a matching decomposition.
	return head, kana
end

-- English names of the kanji grades, indexed by the value returned from
-- m_ja.kanji_grade(); used to build "... terms spelled with <grade> kanji"
-- category names.
local en_grades = {
	"first grade", "second grade", "third grade",
	"fourth grade", "fifth grade", "sixth grade",
	"secondary school", "jinmeiyō", "hyōgai"
}

-- Long spellings accepted for the transitivity and inflection-type
-- parameters, mapped to the short codes that add_transitivity and
-- add_inflections compare against.
local aliases = {
	['transitive']='tr', ['trans']='tr',
	['intransitive']='in', ['intrans']='in', ['intr']='in',
	['godan']='1', ['ichidan']='2', ['irregular']='irr'
}

-- Prefix that the adverb handler puts in front of the opt= value so that
-- add_inflections can recognise it as a list of optional particles.
local adverbs_optional_tag = 'optionally '
-- Accepted spellings of each optional particle, normalised to hiragana.
local adverbs_optional_aliases = {
	['to']='と', ['と']='と', ['ト']='と',
	['ni']='に', ['に']='に', ['ニ']='に',
}
-- Wikilink used when displaying each normalised particle.
local adverbs_optional_links = {
	['と']='[[と#Japanese:_adverbs|と]]',
	['に']='[[に]]',
}

-- Apply part-of-speech-specific cosmetics to a romanisation.
--   rom:          romanised text
--   kana:         the kana it was produced from (only inspected for a "^" marker)
--   pos_category: plural POS name such as "prefixes" or "proper nouns"
-- Returns the adjusted romanisation as a single value.
local function formatting_adjustments(rom, kana, pos_category)
	-- Categories whose romanisation is written with a leading hyphen.
	local takes_leading_hyphen = {
		["suffixes"] = true,
		["suffix forms"] = true,
		["counters"] = true,
		["classifiers"] = true,
	}

	if pos_category == "prefixes" then
		-- Prefixes get a trailing hyphen (an existing one is replaced).
		local hyphenated = rom:gsub('%-?$', '-')
		return hyphenated
	end

	if takes_leading_hyphen[pos_category] then
		local hyphenated = rom:gsub('^%-?', '-')
		return hyphenated
	end

	if pos_category == "proper nouns" and not kana:match'%^' then
		-- Automatic capitalisation, unless capitals were already marked with "^".
		local capitalised = ugsub(rom, '%f[^%s%c%p]%l', string.uupper)
		-- No capitals after medial apostrophes.
		local result = ugsub(capitalised, "%w'%u", ulower)
		return result
	end

	return rom
end

-- Romanise `kana` for the headword described by `data`, applying the
-- inflection-specific and POS-specific adjustments.
-- Returns "-" (no transliteration) for combining forms, punctuation marks and
-- iteration marks.
local function kana_to_romaji_with_pos_format(kana, data, args)
	local pos = data.headword.pos_category
	local untransliterated = pos == "combining forms"
		or pos == "punctuation marks"
		or pos == "iteration marks"
	if untransliterated then
		return "-"
	end

	local rom = remove_links(kana_to_romaji(kana, data.lang_code))

	-- Spell out the final long vowel for -u verbs and -i adjectives, so the
	-- inflecting ending is visible in the romanisation.
	local infl = args['infl']
	local is_u_verb = infl == '1' or infl == '1s' or infl == 'godan'
	local is_i_adjective = infl == 'i' or infl == 'is' or infl == 'い'
	if is_u_verb then
		rom = rom:gsub('ō$', 'ou'):gsub('ū$', 'uu')
	elseif is_i_adjective then
		rom = rom:gsub('ī$', 'ii')
	end

	return formatting_adjustments(rom, kana, pos)
end

-- Return an iterator over the characters of `text` that are worth a
-- "spelled with ..." category: every character from the candidate set below
-- (with any following combining voicing marks) that is not in the excluded
-- set of common kana. The iterator returns nil once no candidate is left.
local function iterate_rare_chars(text)
	-- Candidate: one kana / kana-graph / listed symbol, plus trailing ゙ ゚ marks,
	-- followed by a position capture used to resume the scan.
	local candidate = "([" .. range.kana .. range.kana_graph .. "!-/:-@%[\\-`×△○◎。-〠〶〷〻-〽・·゠=~][゙゚]*)()"
	-- Single characters that are skipped.
	local excluded = "^[ぁ-ちっつて-ろんァ-チッツテ-ロンヲ-゚]$"
	local resume_at

	return function()
		while true do
			local found
			found, resume_at = umatch(text, candidate, resume_at)
			if not found or not umatch(found, excluded) then
				return found
			end
		end
	end
end

-- Register a historical kana spelling for the headword.
--   data:        headword state; categories are appended to
--                data.headword.categories and the formatted link to
--                data.info_hist
--   hist_kana:   the historical spelling (may contain the formatting
--                characters ^ - . % and spaces)
--   modern_kana: the modern reading, or nil; characters and morae that also
--                occur in it are not categorised as historical spellings
-- Errors when the part of speech is "syllables", "kana" or "morae".
local function historical_kana(data, hist_kana, modern_kana)
	-- Disallow historical kana for kana and morae, as there's no one-to-one correspondence.
	local pos = data.headword.pos_category
	if pos == "syllables" or pos == "kana" or pos == "morae" then
		error(("Cannot specify historical kana for %s."):format(pos))
	end
	local hist_kana_no_formatting = hist_kana:gsub("[%^%-%. %%]+", "")
	local rare_chars, lang_name, hc = {}, data.lang_name, data.headword.categories

	-- Literal substring test. The needle comes from the text itself and may be
	-- ASCII punctuation such as "(" or "[", which must not be interpreted as a
	-- Lua pattern, hence the plain find.
	local function in_modern_kana(needle)
		return modern_kana ~= nil and modern_kana:find(needle, 1, true) ~= nil
	end

	for ch in iterate_rare_chars(hist_kana_no_formatting) do
		if not in_modern_kana(ch) then
			rare_chars[ch] = true
		end
	end
	-- Multi-character morae are categorised as well; a mora that is a single
	-- character (after removing spaces) is skipped here.
	for _, mora in ipairs(moraify((ugsub(hist_kana_no_formatting, "[^" .. range.kana .. "]+", " ")))) do
		if not (mora:gsub(" +", ""):match("^.?[\128-\191]*$") or in_modern_kana(mora)) then
			rare_chars[mora] = true
		end
	end
	for ch in pairs(rare_chars) do
		insert(hc, lang_name .. " terms historically spelled with " .. ch)
	end
	insert(data.info_hist, require("Module:ja-link").link({
		lang = data.headword.lang,
		lemma = hist_kana,
		tr = formatting_adjustments(
			remove_links(kana_to_romaji(hist_kana, data.lang_code, nil, {hist = true})),
			hist_kana,
			pos
		)
	}, {
		face = "head",
		disableSelfLink = true,
	}))
end

-- Classify the script of data.pagename.
-- Returns 'hira' if the title consists only of hiragana, 'kata' if only of
-- katakana, 'both' if only of kana of either kind, and nothing otherwise.
-- Punctuation, whitespace and control characters are ignored, except "&" and
-- "@", which count as non-kana. When `digraphs` is truthy the digraphs ゟ and
-- ヿ are accepted as hiragana and katakana respectively.
local function detect_pagename_kana(data, digraphs)
	local title = data.pagename

	-- Exclude "&" and "@", which are part of %p (e.g. リズム&ブルース).
	local function keep_symbols(matched)
		return matched:match("[&@]") or ""
	end

	-- True when nothing is left of the title after deleting the given
	-- characters together with ignorable punctuation and spacing.
	local function made_only_of(chars)
		local leftover = ugsub(title, '[%p%s%c' .. chars .. ']', keep_symbols)
		return leftover == ""
	end

	if made_only_of(range.hiragana .. (digraphs and "ゟ" or "")) then
		return 'hira'
	end
	if made_only_of(range.katakana .. (digraphs and "ヿ" or "")) then
		return 'kata'
	end
	if made_only_of(range.kana .. (digraphs and "ゟヿ" or "")) then
		return 'both'
	end
end

-- Go through args and build the headword from whatever kanas were given.
--
-- Fills in, on `data`:
--   data.pagename_kana          script of the page title ('hira', 'kata', 'both' or nil)
--   data.headword.heads         one entry per kana/romaji, with term, tr and label
--   data.headword.categories    maintenance categories (redundant head, multiple readings)
--   data.kanas                  the bare kana readings
--   data.inflection_base        form/romaji of the first reading, used for inflections
--   data.info_hist              via historical_kana, when hist= is given
--
-- `args` is the processed parameter table; a leading non-kana positional
-- argument (a POS designation) is removed from args[1] in place.
-- Errors with "Kana form is required." when a non-kana title has no reading
-- and the POS is not one of the exempt symbol-like categories.
local function format_headword(args, data)
	local pagename, kanas, lang_name = data.pagename, data.kanas, data.lang_name
	-- Declared up front so that every romanisation below uses the same language
	-- code (it must be in scope for the reading comparison as well as for the
	-- page scan).
	local lang_code = data.lang_code
	data.pagename_kana = detect_pagename_kana(data)
	
	if args[1][1] and not args[1][1]:match'[\128-\255]' then
		-- filter out POS designations
		remove(args[1], 1)
	end
	
	local linked_translit = data.headword.lang:link_tr(Jpan)
	local suru_ending, rom_suru_ending
	if data.headword.pos_category == "suru verbs" then
		suru_ending = "[[する]]"
		rom_suru_ending = linked_translit and " [[suru]]" or " suru"
	else
		suru_ending, rom_suru_ending = "", ""
	end
	
	if data.pagename_kana then -- pure-kana-title entry
		if #args.head > 0 or args.head.default then
			insert(data.headword.categories, lang_name .. " terms with redundant head parameter")
		end
		
		-- {{ja-xxx}} vs {{ja-xxx|こ.うし}} vs {{ja-xxx|コウシ}} in [[こうし]]
		if not args[1][1] then
			args[1][1] = pagename
		elseif remove_links(args[1][1]:gsub("[%^%-%. %%]+", "")) ~= pagename then
			insert(args[1], 1, pagename)
		end
		
		for i, k in ipairs(args[1]) do
			insert(data.headword.heads, {
				term = k:gsub("[%^%-%. %%]+", "") .. suru_ending,
				tr = '-',
				l = args.label[i] and {args.label[i]} or nil,
			})
		end
		
		for i = 1, math.max(args.rom.maxindex, 1) do
			local rom = args.rom[i] or args.rom.default or kana_to_romaji_with_pos_format(args[1][1], data, args)
			if not data.headword.heads[i] then
				-- More romanisations than kana: repeat the previous head.
				data.headword.heads[i] = {term = data.headword.heads[i-1].term}
			end
			if rom == "-" then
				data.headword.heads[i].tr = "-"
			elseif linked_translit then
				data.headword.heads[i].tr = "[[" .. rom .. "]]" .. rom_suru_ending
			else
				data.headword.heads[i].tr = rom .. rom_suru_ending
			end
			
			-- Only the first iteration sets the inflection base, so the form is
			-- always derived from the first kana argument.
			if not data.inflection_base.form then
				data.inflection_base.form = remove_links(args[1][1]:gsub("[%^%-%. %%]+", "")) .. suru_ending
				data.inflection_base.romaji = rom .. rom_suru_ending
			end
		end
		
		kanas[1] = pagename
		
		if args.hist[1] then
			historical_kana(data, args.hist[1], args[1][1])
		end
	else -- non-pure-kana-title entry
		if #args[1] == 0 and not (data.headword.pos_category == "punctuation marks" or data.headword.pos_category == "iteration marks" or data.headword.pos_category == "symbols") then
			error("Kana form is required.")
		end
		if args.head.default == pagename then
			insert(data.headword.categories, lang_name .. " terms with redundant head parameter")
		end
		
		-- Romanisations already shown; a repeated one is displayed as "-".
		local rom_repetition_final = {}
		for i, k in ipairs(args[1]) do
			local rom_auto = kana_to_romaji_with_pos_format(k, data, args)
			local head = args.head[i] or args.head.default or pagename
			if args.head[i] == pagename then
				insert(data.headword.categories, lang_name .. " terms with redundant head parameter")
			end
			
			-- Unless the editor already segmented head or kana with "%", try to
			-- derive the segmentation from the page's kanjitab.
			local head_for_ruby, kana_for_ruby
			if ulen(head) > 1 and head:match'%%' == nil and k:match'%%' == nil then
				head_for_ruby, kana_for_ruby = assign_kana_to_kanji(head, k, pagename, lang_code .. '%-kanjitab')
			else
				head_for_ruby, kana_for_ruby = head, k
			end
			local format_table = m_ja_ruby.parse_text(head_for_ruby, kana_for_ruby, {
				try = 'force',
				try_force_limit = 10000
			})
			local kana_bare = remove_links(k:gsub("[%^%-%. %%]+", ""))
			local rom = args.rom[i] or args.rom.default or rom_auto
			
			head = {
				term = m_ja_ruby.to_wiki(format_table, {
					break_link = true,
				}):gsub('<rt>(..-)</rt>', "<rt>[[" .. kana_bare .."|%1]]</rt>") .. suru_ending,
				l = args.label[i] and {args.label[i]} or nil,
			}
			if rom == "-" or rom_repetition_final[rom] then
				head.tr = "-"
			elseif linked_translit then
				head.tr = "[[" .. rom .. "]]" .. rom_suru_ending
			else
				head.tr = rom .. rom_suru_ending
			end
			insert(data.headword.heads, head)
			
			rom_repetition_final[rom] = true
			insert(kanas, kana_bare)
			
			if args.hist[i] then
				historical_kana(data, args.hist[i], k)
			end
			
			if not data.inflection_base.form then
				data.inflection_base.form = remove_links(m_ja_ruby.to_markup(format_table)) .. suru_ending
				data.inflection_base.romaji = rom .. rom_suru_ending
			end
		end
		
		if not kanas[1] then
			return
		end
		
		-- Readings are compared by their lower-cased romanisation with the "%"
		-- segment markers removed.
		local function normalize_reading(reading)
			return (ulower(kana_to_romaji(reading, lang_code)):gsub("%%", ""))
		end
		
		local first_reading = normalize_reading(kanas[1])
		local multiple = false
		
		for i = 2, #kanas do
			if normalize_reading(kanas[i]) ~= first_reading then
				multiple = true
				break
			end
		end
		
		if not multiple then
			-- Look at the other headword templates in this language's section of
			-- the current page. getContent() yields nil when the page does not
			-- exist, in which case there is nothing to scan.
			local content = mw.title.getCurrentTitle():getContent()
			local loc1, loc2
			if content then
				loc1, loc2 = content:find("%f[^%z%s]==%s*" .. lang_name:gsub("%-", "%%%-") .. "%s*==()")
			end
			if loc1 then
				-- The section runs up to the next level-2 header, or to the end of
				-- the page when there is none (loc2 == nil).
				loc2 = content:find("%f[^%z%s]==[^\n=]+==", loc2)
				content = content:sub(loc1, loc2)
				
				-- Positional parameter holding the reading, per template name.
				local reading_param = {
					[lang_code .. "-head"] = 2,
					[lang_code .. "-pos"] = 2,
					[lang_code .. "-noun"] = 1,
					[lang_code .. "-verb"] = 1,
					[lang_code .. "-adj"] = 1,
					[lang_code .. "-phrase"] = 1,
					[lang_code .. "-verb form"] = 1,
					[lang_code .. "-verb-suru"] = 1,
					-- TODO: for -see, check the linked page when the argument is not kana
					[lang_code .. "-see"] = 1,
				}
				
				for template in require("Module:template parser").find_templates(content) do
					-- Take only the name; any further return values must not be
					-- mistaken for a reading.
					local name = template:get_name()
					local param = reading_param[name]
					local reading
					if param then
						reading = template:get_arguments()[param]
						if reading ~= nil then
							reading = remove_links(reading):gsub("%%", "")
						end
					end
					if reading and normalize_reading(reading) ~= first_reading then
						multiple = true
						break
					end
				end
			end
		end
		
		if multiple then
			insert(data.headword.categories, lang_name .. " terms with multiple readings")
		end
	end
end

-- Record a verb's transitivity.
-- `tr` may be "tr", "in", "both" or one of the long spellings in `aliases`.
-- A recognised value adds a label to data.info_mid and the matching verb
-- categories; anything else (including nil) adds the
-- "verbs without transitivity" category.
local function add_transitivity(data, tr)
	local kind = aliases[tr] or tr
	local is_transitive = kind == "tr" or kind == "both"
	local is_intransitive = kind == "in" or kind == "both"

	if not (is_transitive or is_intransitive) then
		insert(data.headword.categories, data.lang_name .. " verbs without transitivity")
		return
	end

	local label
	if is_transitive and is_intransitive then
		label = 'transitive or intransitive'
	elseif is_transitive then
		label = 'transitive'
	else
		label = 'intransitive'
	end
	insert(data.info_mid, label)

	if is_transitive then
		insert(data.headword.categories, data.lang_name .. " transitive verbs")
	end
	if is_intransitive then
		insert(data.headword.categories, data.lang_name .. " intransitive verbs")
	end
end

-- Romanise the last mora of `lemma` (given in ruby markup), for use in
-- "ending with -..." category names.
local function get_final(lemma, data)
	local ruby_text = m_ja_ruby.to_ruby(m_ja_ruby.parse_markup(lemma))
	local morae = moraify(ruby_text)
	-- remove() without an index pops and returns the last element.
	local last_mora = remove(morae)
	return kana_to_romaji(last_mora, data.lang_code)
end

-- Add the inflection-class label, the inflection categories and (where they
-- can be derived) the inflected forms for the headword.
--   data:            headword state; reads data.inflection_base and appends to
--                    data.info_mid, data.headword.categories and
--                    data.headword.inflections
--   inflection_type: inflection code such as '1', '2', 'suru', 'i', 'na', ...
--                    (long spellings in `aliases` are accepted), or the
--                    adverbs_optional_tag prefix followed by a ':'-separated
--                    list of optional particles; may be nil
--   cat_suffix:      plural POS used in category names ('verbs',
--                    'adjectives', ...); no categories are added when nil
-- Only data.lang_code == 'ja' is handled; other languages are left untouched.
-- Errors on an unrecognised optional-particle value.
local function add_inflections(data, inflection_type, cat_suffix)
	local lemma = data.inflection_base.form
	local romaji = data.inflection_base.romaji
	inflection_type = aliases[inflection_type] or inflection_type
	
	-- Replace the kana ending `lemma_from` of the lemma (and `romaji_from` of the
	-- romaji) by each of the endings in lemma_to / romaji_to.
	-- Returns a list of {lemma=..., romaji=...}, or nil if any ending does not apply.
	local function replace_suffix(lemma_from, lemma_to, romaji_from, romaji_to)
		-- e.g. 持って来る, lemma = "[持](も)って来(く)る"
		-- lemma_from = "くる", lemma_to = {"き","きた"}
		local result = {}
		local pattern_from, n_from = lemma_from:gsub('.[\128-\191]*', function(c)
			return '[' .. c .. m_ja.hira_to_kata(c) .. ']([^' .. range.kana .. ']*)'
		end)
		
		pattern_from = pattern_from .. '$'
		-- "[くク]([^kana range]*)[るル]([^kana range]*)$"
		
		for i_lemma_to, s_lemma_to in ipairs(lemma_to) do
			-- Re-insert the captured non-kana markup after each replacement kana.
			local n_to = 0
			local pattern_to = s_lemma_to:gsub('.[\128-\191]*', function(c)
				if n_to < n_from then
					n_to = n_to + 1
					return c .. "%" .. n_to
				else
					return c
				end
			end)
			
			-- Captures left over when the new ending is shorter go at the end.
			for i = n_to + 1, n_from do
				pattern_to = pattern_to .. "%" .. i
			end
			-- "き%1%2", "き%1た%2"
			
			local lemma_inflected, success = ugsub(lemma, pattern_from, pattern_to)
			if success == 0 then
				return
			end
			
			local romaji_inflected
			romaji_inflected, success = romaji:gsub(romaji_from .. "$", romaji_to[i_lemma_to])
			if success == 0 then
				-- The romaji may be wrapped in a link.
				romaji_inflected, success = romaji:gsub("%[%[" .. romaji_from .. "%]%]$", "[[" .. romaji_to[i_lemma_to] .. "]]")
				if success == 0 then
					return
				end
			end
			
			insert(result, {lemma = lemma_inflected, romaji = romaji_inflected})
		end
		return result -- {{lemma="[持](も)って来(き)",romaji="motteki"},{lemma="[持](も)って来(き)た",romaji="mottekita"}}
	end

	-- Append one labelled inflection with the given forms to the headword.
	local function insert_form(label, ...)
		-- label = "stem" or "past" etc.
		-- ... = {lemma=...,romaji=...},{lemma=...,romaji=...}
		local labeled_forms = {label = label}
		for _, v in ipairs{...} do
			local table_form = m_ja_ruby.parse_markup(v.lemma)
			local form_term = m_ja_ruby.to_wiki(table_form)
			-- Link the whole form unless it already contains a link.
			if not form_term:find'%[%[.+%]%]' then
				form_term = '[[' .. m_ja_ruby.to_text(table_form) .. '#' .. data.lang_name .. '|' .. form_term .. ']]'
			end
			insert(labeled_forms, {
				term = form_term,
				translit = v.romaji,
			})
		end
		insert(data.headword.inflections, labeled_forms)
	end

	local inflected_forms
	if data.lang_code == 'ja' then
		if inflection_type == '1' or inflection_type == '1s' then
			insert(data.info_mid, '<abbr title="godan (group 1) conjugation">godan</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " godan " .. cat_suffix)
				if cat_suffix == "verbs" then
					local final = get_final(lemma, data)
					insert(data.headword.categories, data.lang_name .. " godan " .. cat_suffix .. " ending with -" .. final)
					if final == "ru" then
						if umatch(romaji, "[iIīĪ]ru$") then
							insert(data.headword.categories, data.lang_name .. " godan " .. cat_suffix .. " ending with -iru")
						elseif umatch(romaji, "[eEēĒ]ru$") then
							insert(data.headword.categories, data.lang_name .. " godan " .. cat_suffix .. " ending with -eru")
						end
					end
				end
			end
			if inflection_type == '1' then
				inflected_forms =
					replace_suffix('く', {'き', 'いた'}, 'ku', {'ki', 'ita'}) or
					replace_suffix('ぐ', {'ぎ', 'いだ'}, 'gu', {'gi', 'ida'}) or
					replace_suffix('す', {'し', 'した'}, 'su', {'shi', 'shita'}) or
					replace_suffix('つ', {'ち', 'った'}, 'tsu', {'chi', 'tta'}) or
					replace_suffix('ぬ', {'に', 'んだ'}, 'nu', {'ni', 'nda'}) or
					replace_suffix('ぶ', {'び', 'んだ'}, 'bu', {'bi', 'nda'}) or
					replace_suffix('む', {'み', 'んだ'}, 'mu', {'mi', 'nda'}) or
					replace_suffix('る', {'り', 'った'}, 'ru', {'ri', 'tta'}) or
					replace_suffix('う', {'い', 'った'}, 'u', {'i', 'tta'})
				if inflected_forms then
					insert_form('stem', inflected_forms[1])
					insert_form('past', inflected_forms[2])
				else
					require'Module:debug'.track'Jpan-headword/inflection failed/ja'
				end
			else
				-- '1s': godan verbs with special forms.
				inflected_forms =
					replace_suffix('る', {'り', 'った', 'い'}, 'ru', {'ri', 'tta', 'i'}) or --くださる
					replace_suffix('いく', {'いき', 'いった'}, 'iku', {'iki', 'itta'}) or --行く
					replace_suffix('う', {'い', 'うた'}, 'ou', {'oi', 'ōta'}) --問う
				if inflected_forms then
					-- inflected_forms[3] exists only for the くださる type.
					insert_form('stem', inflected_forms[1], inflected_forms[3])
					insert_form('past', inflected_forms[2])
				else
					require'Module:debug'.track'Jpan-headword/inflection failed/ja'
				end
			end
		elseif inflection_type == '2' then
			insert(data.info_mid, '<abbr title="ichidan (group 2) conjugation">ichidan</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " ichidan " .. cat_suffix)
				if umatch(romaji, "[iIīĪ]ru$") then
					insert(data.headword.categories, data.lang_name .. " kami ichidan " .. cat_suffix)
				elseif umatch(romaji, "[eEēĒ]ru$") then
					insert(data.headword.categories, data.lang_name .. " shimo ichidan " .. cat_suffix)
				else
					insert(data.headword.categories, data.lang_name .. " irregular " .. cat_suffix)
				end
			end
			inflected_forms = replace_suffix('る', {'', 'た'}, 'ru', {'', 'ta'})
			if inflected_forms then
				insert_form('stem', inflected_forms[1])
				insert_form('past', inflected_forms[2])
			else
				require'Module:debug'.track'Jpan-headword/inflection failed/ja'
			end
		elseif inflection_type == 'suru' then
			insert(data.info_mid, '<abbr title="suru (group 3) conjugation">suru</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " suru " .. cat_suffix)
			end
			inflected_forms =
				replace_suffix('する', {'し', 'した'}, 'suru', {'shi', 'shita'}) or
				replace_suffix('ずる', {'じ', 'じた'}, 'zuru', {'ji', 'jita'})
			if inflected_forms then
				insert_form('stem', inflected_forms[1])
				insert_form('past', inflected_forms[2])
			else
				require'Module:debug'.track'Jpan-headword/inflection failed/ja'
			end
		elseif inflection_type == 'kuru' then
			insert(data.info_mid, '<abbr title="kuru (group 3) conjugation">kuru</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " kuru " .. cat_suffix)
			end
			inflected_forms = replace_suffix('くる', {'き', 'きた'}, 'kuru', {'ki', 'kita'})
			if inflected_forms then
				insert_form('stem', inflected_forms[1])
				insert_form('past', inflected_forms[2])
			else
				require'Module:debug'.track'Jpan-headword/inflection failed/ja'
			end
		elseif inflection_type == 'i' or inflection_type == 'い' then
			insert(data.info_mid, '<abbr title="-i (type I) inflection">-i</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " い-i " .. cat_suffix)
			end
			inflected_forms = replace_suffix('い', {'く'}, 'i', {'ku'})
			if inflected_forms then
				insert_form('adverbial', inflected_forms[1])
			else
				require'Module:debug'.track'Jpan-headword/inflection failed/ja'
			end
		elseif inflection_type == 'is' then
			insert(data.info_mid, '<abbr title="-i (type I) inflection">-i</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " い-i " .. cat_suffix)
			end
			inflected_forms = replace_suffix('いい', {'よく'}, 'ii', {'yoku'})
			if inflected_forms then
				insert_form('adverbial', inflected_forms[1])
			else
				require'Module:debug'.track'Jpan-headword/inflection failed/ja'
			end
		elseif inflection_type == 'na' or inflection_type == 'な' then
			insert(data.info_mid, '<abbr title="-na (type II) inflection">-na</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " な-na " .. cat_suffix)
			end
			-- An empty "from" ending always matches, so the forms are appended.
			inflected_forms = replace_suffix('', {'[[な]]', '[[に]]'}, '', {' [[na]]', ' [[ni]]'})
			insert_form('adnominal', inflected_forms[1])
			insert_form('adverbial', inflected_forms[2])

		elseif inflection_type == "yo" then
			insert(data.info_mid, '<abbr title="yodan conjugation (classical)"><sup><small>†</small></sup>yodan</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " yodan " .. cat_suffix)
				insert(data.headword.categories, data.lang_name .. " yodan " .. cat_suffix .. " ending with -" .. get_final(lemma, data))
			end
		elseif inflection_type == "kami ni" then
			insert(data.info_mid, '<abbr title="kami nidan conjugation (classical)"><sup><small>†</small></sup>nidan</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " nidan " .. cat_suffix)
				insert(data.headword.categories, data.lang_name .. " kami nidan " .. cat_suffix)
			end
		elseif inflection_type == "shimo ni" then
			insert(data.info_mid, '<abbr title="shimo nidan conjugation (classical)"><sup><small>†</small></sup>nidan</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " nidan " .. cat_suffix)
				insert(data.headword.categories, data.lang_name .. " shimo nidan " .. cat_suffix)
			end
		elseif inflection_type == "rahen" then
			insert(data.info_mid, '<abbr title="r-special conjugation (classical)"><sup><small>†</small></sup>-ri</abbr>')
		elseif inflection_type == "sahen" then
			insert(data.info_mid, '<abbr title="s-special conjugation (classical)"><sup><small>†</small></sup>-se</abbr>')
		elseif inflection_type == "kahen" then
			insert(data.info_mid, '<abbr title="k-special conjugation (classical)"><sup><small>†</small></sup>-ko</abbr>')
		elseif inflection_type == "nahen" then
			insert(data.info_mid, '<abbr title="n-special conjugation (classical)"><sup><small>†</small></sup>-n</abbr>')
		elseif inflection_type == "nari" or inflection_type == "なり" then
			insert(data.info_mid, '<abbr title="-nari inflection (classical)"><sup><small>†</small></sup>-nari</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " なり-nari " .. cat_suffix)
			end
		elseif inflection_type == 'tari' or inflection_type == 'たり' then
			insert(data.info_mid, '<abbr title="-tari inflection (classical)"><sup><small>†</small></sup>-tari</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " たり-tari " .. cat_suffix)
			end
			inflected_forms = replace_suffix('', {'[[とした]]', '[[たる]]', '[[と]]', '[[として]]'}, '', {' [[to shita]]', ' [[taru]]', ' [[to]]', ' [[to shite]]'})
			insert_form('adnominal', inflected_forms[1], inflected_forms[2])
			insert_form('adverbial', inflected_forms[3], inflected_forms[4])
		elseif inflection_type == "ku" or inflection_type == "く" then
			insert(data.info_mid, '<abbr title="-ku inflection (classical)"><sup><small>†</small></sup>-ku</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " く-ku " .. cat_suffix)
			end
		elseif inflection_type == "shiku" or inflection_type == "しく" then
			insert(data.info_mid, '<abbr title="-shiku inflection (classical)"><sup><small>†</small></sup>-shiku</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " しく-shiku " .. cat_suffix)
			end
		elseif inflection_type == "ka" or inflection_type == "か" then
			insert(data.info_mid, '<abbr title="-ka inflection (dialectal)"><sup><small>†</small></sup>-ka</abbr>')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " か-ka " .. cat_suffix)
			end
		elseif inflection_type and inflection_type:len() > adverbs_optional_tag:len() and inflection_type:sub(1, adverbs_optional_tag:len()) == adverbs_optional_tag then
			-- Adverbs with optional particles: the value is the tag followed by a
			-- ':'-separated list such as "to:ni". All working variables are local
			-- so that nothing leaks into the global environment.
			local adverbs_optional_list = inflection_type:sub(adverbs_optional_tag:len() + 1)
			for option in gsplit(adverbs_optional_list, ':') do
				local normalized_option = adverbs_optional_aliases[option]
				if not normalized_option then
					error('unrecognized adverb opt= argument: "' .. option .. '"')
				end
				local normalized_option_romaji = kana_to_romaji(normalized_option, data.lang_code)
				local normalized_option_link = adverbs_optional_links[normalized_option]
				inflected_forms = replace_suffix('', {normalized_option_link}, '', {' [[' .. normalized_option_romaji .. ']]'})
				insert_form('optionally as', inflected_forms[1])
				if cat_suffix then
					insert(data.headword.categories, data.lang_name .. " " .. cat_suffix .. " optionally taking " .. normalized_option .. "-" .. normalized_option_romaji)
				end
			end

		elseif inflection_type == 'irr' then
			insert(data.info_mid, 'irregular')
			if cat_suffix then
				insert(data.headword.categories, data.lang_name .. " irregular " .. cat_suffix)
			end
		elseif inflection_type == '-' or inflection_type == 'un' then
			insert(data.info_mid, 'uninflectable')
		end
	--elseif data.lang_code == 'ryu' then ...
	end
end

-- Add the categories that depend only on the page title (and, for a few
-- checks, on the part of speech) to data.headword.categories:
-- kanji grade and kanji count, script of the title, multiple scripts,
-- rare characters / multi-character morae, and mixed kana.
-- The katakana category goes into data.katakana_category instead.
local function add_categories(data)
	local lang_name, pagename = data.lang_name, data.pagename
	local cats = data.headword.categories

	-- One grade category per kanji in the title (if it contains any kanji).
	local kanji_count = 0
	for kanji in ugmatch(pagename, "[" .. range.kanji .. "々〻]") do
		kanji_count = kanji_count + 1
		-- Iteration marks are counted, but are not kanji for the purposes of
		-- grade categorisation.
		local is_iteration_mark = kanji == "々" or kanji == "〻"
		if not is_iteration_mark then
			local grade_name = en_grades[m_ja.kanji_grade(kanji)]
			insert(cats, (lang_name .. " terms spelled with %s kanji"):format(grade_name))
		end
	end

	-- Categorise by the number of kanji; single-character titles get two more.
	if kanji_count ~= 0 then
		insert(cats, (lang_name .. " terms with %s kanji"):format(kanji_count))
		if ulen(pagename) == 1 then
			insert(cats, lang_name .. " terms spelled with " .. pagename)
			insert(cats, lang_name .. " single-kanji terms")
		end
	end

	-- Categorise by the script of the title (digraphs ゟ/ヿ count as kana here).
	local title_script = detect_pagename_kana(data, true)
	if title_script == 'hira' then
		insert(cats, lang_name .. " hiragana")
	end
	if title_script == 'kata' then
		insert(data.katakana_category, lang_name .. " katakana")
	end

	-- Something Japanese-script was removed, yet something else is left over.
	local japanese_chars = '[' .. range.kana .. range.kanji .. range.ideograph .. range.kana_graph .. range.punctuation .. ']+'
	local leftover, removed_runs = ugsub(pagename, japanese_chars, '')
	if leftover ~= '' and removed_runs > 0 then
		insert(cats, lang_name .. " terms written in multiple scripts")
	end

	local pos = data.headword.pos_category
	local spelled_with = {}
	for ch in iterate_rare_chars(pagename) do
		spelled_with[ch] = true
	end
	-- Categorise yōon, but exclude kana and mora entries, since they can't be spelled with themselves.
	-- FIXME: allow kana categories for morae.
	local is_kana_entry = pos == "syllables" or pos == "kana" or pos == "morae"
	if not is_kana_entry then
		local kana_only = ugsub(pagename, "[^" .. range.kana .. "]+", " ")
		for _, mora in ipairs(moraify(kana_only)) do
			-- Keep only morae longer than a single character.
			if not mora:gsub(" +", ""):match("^.?[\128-\191]*$") then
				spelled_with[mora] = true
			end
		end
	end
	for ch in pairs(spelled_with) do
		insert(cats, lang_name .. " terms spelled with " .. ch)
	end

	-- Mixed kana: hiragana and katakana both present (not for proverbs/phrases).
	if pos ~= "proverbs" and pos ~= "phrases" then
		local hira_class = "[" .. range.hiragana .. "]"
		local kata_class = "[" .. range.katakana .. "]"
		local without_katakana = ugsub(pagename, kata_class .. "+", "")
		if umatch(without_katakana, hira_class) then
			local without_hiragana = ugsub(pagename, hira_class .. "+", "")
			if umatch(without_hiragana, kata_class) then
				insert(cats, lang_name .. " terms spelled with mixed kana")
			end
		end
	end
end

-- Verbs: transitivity from tr=, conjugation class from infl=.
pos_functions.verbs = function(args, data)
	local transitivity, conjugation = args.tr, args.infl
	add_transitivity(data, transitivity)
	add_inflections(data, conjugation, 'verbs')
end

-- Suffixes: show the inflection given by infl=, without inflection categories.
pos_functions.suffixes = function(args, data)
	local inflection = args.infl
	add_inflections(data, inflection)
end

-- Auxiliary verbs: categorised as auxiliary verbs explicitly, then handed to
-- the headword module as plain "verbs". No inflection categories are added.
pos_functions["auxiliary verbs"] = function(args, data)
	local headword = data.headword
	insert(headword.categories, data.lang_name .. " auxiliary verbs")
	add_inflections(data, args.infl)
	headword.pos_category = "verbs"
end

-- Suru verbs: always the 'suru' conjugation; categorised as "verbs".
pos_functions["suru verbs"] = function(args, data)
	local transitivity = args.tr
	add_transitivity(data, transitivity)
	add_inflections(data, 'suru', 'verbs')
	data.headword.pos_category = "verbs"
end

-- Adjectives: inflection class from infl=, with adjective categories.
pos_functions.adjectives = function(args, data)
	local inflection = args.infl
	add_inflections(data, inflection, 'adjectives')
end

-- Nouns: the counter (classifier) parameter count=.
-- "-" marks the noun as uncountable; any other non-empty value is shown as
-- its counter; a missing or empty value adds nothing.
pos_functions.nouns = function(args, data)
	local counter = args.count
	if not counter or counter == "" then
		return
	end

	local inflection
	if counter == "-" then
		inflection = {label = "uncountable"}
	else
		inflection = {label = "counter", counter}
	end
	insert(data.headword.inflections, inflection)
end

-- Adverbs: opt= lists optional particles. The value is tagged with
-- adverbs_optional_tag so that add_inflections treats it as such a list.
pos_functions.adverbs = function(args, data)
	local opt = args.opt
	local inflection_type = opt and (adverbs_optional_tag .. opt) or opt
	add_inflections(data, inflection_type, 'adverbs')
end

--[==[
Generate the categories derived from the page name and, optionally, from the
part of speech. Also used by soft redirect pages ([[Module:ja-see]]).
No sort key is produced.

Fields read from `data`:
	pagename          -- (required) title to categorise
	lang              -- (required) language object
	categories        -- (required) list that receives the categories
	katakana_category -- (required) list that receives katakana-sorted categories
	pos               -- (optional) singular POS such as "noun" or "verb";
	                     no POS categories are added when absent

Fields written to `data`: lang_name, pagename_kana and headword (a table whose
`categories` field is the list passed in).
]==]
function export.cat(data)
	local lang_name = data.lang:getCanonicalName()
	data.lang_name = lang_name
	data.pagename_kana = detect_pagename_kana(data)

	local singular = data.pos
	if singular then
		-- Pluralise: "...x" becomes "...xes", everything else just takes "s".
		local stem = singular:gsub('x$', 'xe')
		local plural = stem .. 's'
		insert(data.categories, lang_name .. ' ' .. plural)
		local lemma_kind = require'Module:headword'.pos_lemma_or_nonlemma(plural, true)
		insert(data.categories, lang_name .. ' ' .. lemma_kind .. 's')
	end

	-- add_categories writes to data.headword.categories.
	data.headword = { categories = data.categories }
	add_categories(data)
end

--[==[
Main entry point; the only function meant to be invoked from a template.

Invocation arguments (frame.args): 1 = language code, 2 = part-of-speech
category (plural); when 2 is absent, 1 is used as the part of speech too.
Template arguments are read from the parent frame.
Returns the wikitext of the headword line, preceded by the katakana
categories if there are any.
]==]
function export.show(frame)
	local invoke_args = frame.args
	local poscat = invoke_args[2] or invoke_args[1] or error("Part of speech has not been specified. Please pass parameter 1 to the module invocation.")

	local params = {
		[1] = {list = true},
		rom = {list = true, allow_holes = true, separate_no_index = true},
		head = {list = true, allow_holes = true, separate_no_index = true},
		label = {list = true, allow_holes = true},
		hist = {list = true},
		hhira = {alias_of = 'hist'},
		hkata = {alias_of = 'hist'},
		tr = {},
		infl = {},
		type = {alias_of = 'infl'},
		decl = {alias_of = 'infl'},
		opt = {},
		count = {},
		sort = {},
		pagename = {},
		-- For backwards compatibility with uses of {{ja-syllable}} with the script parameter.
		sc = poscat == "syllables" and {} or nil,
	}

	local args = require('Module:parameters').process(frame:getParent().args, params)

	local headword = {
		pos_category = poscat,
		categories = {},
		heads = {},
		no_redundant_head_cat = true,
		inflections = {},
		genders = {'m'}, -- placeholder, replaced in the output below
		nogendercat = true
	}
	local data = {
		headword = headword,
		-- custom info
		pagename = args.pagename or mw.loadData("Module:headword/data").pagename,
		pagename_kana = nil, -- "hira" "kata" "both", nil
		lang_code = invoke_args[1],
		lang_name = nil, -- "Japanese", "Okinawan" ...
		katakana_category = {},
		info_mid = {}, -- "godan", "intransitive" ...
		info_hist = {}, -- historical kana
		inflection_base = {}, -- base of inflections
		kanas = {}, -- kana id
	}
	headword.lang = require("Module:languages").getByCode(data.lang_code)
	data.lang_name = headword.lang:getCanonicalName()

	-- Sort out all the kanas and do the romanization business.
	format_headword(args, data)

	-- Add certain inflections and categories for adjectives, verbs, nouns, or adverbs.
	local pos_handler = pos_functions[poscat]
	if pos_handler then
		pos_handler(args, data)
	end

	-- Categories and sort key.
	add_categories(data)
	local sort_base = args.sort or data.kanas[1] or data.pagename
	data.headword.sort_key = data.headword.lang:makeSortKey(sort_base)

	local katakana_category = ""
	if #data.katakana_category > 0 then
		katakana_category = require("Module:utilities").format_categories(
			data.katakana_category,
			data.headword.lang,
			nil,
			sort_base,
			nil,
			require("Module:scripts").getByCode("Kana")
		) or ""
	end

	-- Output: post-process the HTML produced by the headword module.
	local html = require('Module:headword').full_headword(data.headword)

	-- The placeholder gender is swapped for the historical kana and the
	-- italicised grammar info. A function is used so that the replacement
	-- text is inserted literally.
	html = html:gsub('<span class="gender">.-</span>', function()
		local hist = ''
		if #data.info_hist > 0 then
			hist = '<sup>←' .. concat(data.info_hist, ' or ') .. '<sup>[[w:Historical kana orthography|?]]</sup></sup>'
		end
		return hist .. ('<i>' .. concat(data.info_mid, '&nbsp;') .. '</i>')
	end)

	-- Walk over the bold headword forms, counting them against data.kanas.
	-- Returning nothing leaves the matched text as it is.
	local i_kanas = 0
	html = html:gsub('<strong .->.-</strong>', function(m0)
		i_kanas = i_kanas + 1
		if data.kanas[i_kanas] then
			return m0
		end
	end)

	-- Merge the nested transliteration spans into a single span.
	html = html:gsub('<span class="headword%-tr tr" dir="ltr"><span class="Latn" lang="ja">', '<span lang="ja-Latn" class="headword-tr tr Latn" dir="ltr">')
	html = html:gsub('</span></span>', '</span>')

	return katakana_category .. html
end

return export