-- texdoclib-score.tlu: scoring functions for texdoc
--
-- The TeX Live Team, GPLv3, see texdoclib.tlu for details
-- dependencies
local md5 = require 'md5'
local texdoc = {
const = require 'texdoclib-const',
util = require 'texdoclib-util',
config = require 'texdoclib-config',
}
-- module table: public functions are attached to it and it is returned at
-- the end of this file
local M = {}
-- shared variables, filled by M.confline_to_score and read by set_score:
--   global_adjscore: lowercased pattern -> numeric adjustment
--   spec_adjscore:   lowercased keyword -> table of the same shape
local global_adjscore, spec_adjscore = {}, {}
------------------------- configuration directives -------------------------
-- Store a numeric score under the lowercased `key` of `tab`, keeping any
-- value that is already there (the first definition wins).
--   tab: score table to update in place
--   key: pattern name; lowercased before use
--   val: score, converted with tonumber()
-- Returns true when `val` is numeric (even if an older entry was kept) and
-- false, leaving `tab` untouched, when it is not.
local function set_score_table(tab, key, val)
  local slot = string.lower(key)
  local amount = tonumber(val)
  if amount == nil then
    return false
  end
  if tab[slot] == nil then
    tab[slot] = amount
  end
  return true
end
-- Try to read `line` as a score directive and record it.
-- Two forms are recognised, both anchored at the start of the line:
--   adjscore <pattern> = <number>             (stored in global_adjscore)
--   adjscore(<keyword>) <pattern> = <number>  (stored in spec_adjscore)
-- Returns true when a directive with a numeric value was recognised and
-- false otherwise. An earlier definition of the same pattern is never
-- overridden (see set_score_table).
-- Note: inside the value class, "+-." is a range, so a comma is accepted by
-- the pattern as well; such values are then rejected by tonumber().
function M.confline_to_score(line)
  -- global form
  local g_pat, g_val = string.match(line,
    '^adjscore%s+([%w%p]+)%s*=%s*([%d+-.]+)')
  if g_pat and g_val then
    return set_score_table(global_adjscore, g_pat, g_val)
  end

  -- keyword-specific form
  local kw, s_pat, s_val = string.match(line,
    '^adjscore%(([%w%p]+)%)%s+([%w%p]+)%s*=%s*([%d+-.]+)')
  if not (kw and s_pat and s_val) then
    return false
  end

  -- keywords are stored lowercased; the per-keyword table is created on
  -- first use, before the value is validated
  kw = string.lower(kw)
  local bucket = spec_adjscore[kw]
  if bucket == nil then
    bucket = {}
    spec_adjscore[kw] = bucket
  end
  return set_score_table(bucket, s_pat, s_val)
end
---------------------------- score computation -----------------------------
-- Split `filename` into <base>, <lang>, <ext>.
--   <ext>  is what texdoc.util.get_ext() reports for the file name.
--   <base> is the file name without a non-empty <ext> and the one character
--          in front of it, and without a trailing "-<lang>" if present.
--   <lang> is the key of texdoc.const.lang_codes found as "-<key>" at the
--          end of the base, or nil when there is none.
local function parse(filename)
  local ext = texdoc.util.get_ext(filename)
  local base = filename
  if ext ~= nil and ext ~= '' then
    -- cut the extension plus the character preceding it
    base = filename:sub(1, -#ext - 2)
  end
  for code in pairs(texdoc.const.lang_codes) do
    local tail = '-' .. code
    if base:sub(-#tail) == tail then
      return base:sub(1, -#tail - 1), code, ext
    end
  end
  return base, nil, ext
end
-- Says if `pat` is a "subword" of `str`: some plain-text occurrence of
-- `pat` is delimited on both sides. A side counts as delimited when the
-- occurrence touches that end of `str`, or when the boundary character of
-- the occurrence itself or the character just outside it is punctuation
-- (%p).
-- Every occurrence is examined, so 'foo' is a subword of 'foobar-foo' even
-- though its first occurrence is not delimited. For an empty `pat` only the
-- first (empty) occurrence is checked.
-- Returns a boolean.
local function is_subword(str, pat)
  local function is_delim(pos)
    return string.find(string.sub(str, pos, pos), '%p') ~= nil
  end

  local init = 1
  repeat
    local i, j = string.find(str, pat, init, true)
    if not i then
      return false
    end
    if (i == 1 or is_delim(i) or is_delim(i - 1))
        and (j == #str or is_delim(j) or is_delim(j + 1)) then
      return true
    end
    -- resume right after the start of this occurrence so that overlapping
    -- occurrences are examined too
    init = i + 1
  until pat == '' or init > #str
  return false
end
-- Says if the basename of `file` (everything after the last '/') is listed
-- in the badbasename_list config value. Each entry is inserted unescaped
-- into a Lua pattern and must match either the whole basename or the part
-- of it before a '.'.
local function has_bad_basename(file)
  local basename = file:gsub('.*/', '')
  local bad_names = texdoc.config.get_value('badbasename_list')
  for _, bad in ipairs(bad_names) do
    local whole = basename:find('^' .. bad .. '$')
    if whole or basename:find('^' .. bad .. '%.') then
      return true
    end
  end
  return false
end
-- Compute the heuristic score of the file name `name` for the pattern
-- `pat`. The score starts at -10 (no match) and is raised as follows:
--   exact match: 4, subword match: 1;
--   for each suffix of suffix_list (only when `pat` has no slash), the
--   derived pattern pat .. suffix: exact match 4.5, subword match 3.5.
-- A positive score is then forced down to 0.1 when the extension or the
-- basename is a bad one, and finally 1.5 is added when `name` has a
-- directory component equal to `pat` (again only when `pat` has no slash).
--   dbg_score: function(fmt, ...) receiving the debug trace
-- Returns the score as a number.
local function pattern_score(name, pat, dbg_score)
  -- `pat` is passed as an argument, not concatenated: the message is used
  -- as a format string (cf. the '%.1f' calls below), so a '%' in the
  -- pattern must not end up in it
  dbg_score('Start heuristic scoring with pattern: %s', pat)

  -- score management: only ever raise the score, unless `force` is set
  local score = -10
  local function upscore(s, reason, force)
    if s > score or force then
      score = s
      dbg_score('New heuristic score: %.1f. Reason: %s', s, reason)
    end
  end

  -- look for exact or subword match
  if M.is_exact(name, pat) then
    upscore(4, 'exact match')
  elseif is_subword(name, pat) then
    upscore(1, 'subword match')
  end

  -- try derivatives unless pat contains a slash
  local slash = not not string.find(pat, '/', 1, true)
  if not slash then
    for _, suffix in ipairs(texdoc.config.get_value('suffix_list')) do
      local deriv = pat .. suffix
      if M.is_exact(name, deriv) then
        upscore(4.5, 'exact match for derived pattern: ' .. deriv)
      elseif is_subword(name, deriv) then
        upscore(3.5, 'subword match for derived pattern: ' .. deriv)
      end
    end
  end

  -- if extension is bad, score becomes an epsilon
  local ext = texdoc.config.get_value('ext_list')[M.ext_pos(name)]
  if ext and texdoc.config.get_value('badext_list_inv')[ext] and score > 0 then
    upscore(0.1, 'bad extension', true)
  end

  -- if basename is bad, score becomes an epsilon
  if has_bad_basename(name) and score > 0 then
    upscore(0.1, 'bad basename', true)
  end

  -- bonus for being in the right directory
  if string.find('/' .. name, '/' .. pat .. '/', 1, true) and not slash then
    upscore(score + 1.5, 'directory bonus')
  end

  -- done
  dbg_score('Final heuristic score: %.1f', score)
  return score
end
-- Compute the score of the docfile `df` and store it in df.score (always a
-- float).
--   df:          docfile; the fields read here are normname, realpath,
--                matches, tree, tlptodoc, runtodoc, details and lang
--   original_kw: keyword used to select the keyword-specific adjscore
--                table; it is lowercased for the lookup because
--                M.confline_to_score stores those tables under lowercased
--                keywords
-- Steps: best score over all matched patterns, fallback scores from
-- package associations, catalogue metadata bonus, locale bonus/malus, then
-- the keyword-specific and global adjscore adjustments.
local function set_score(df, original_kw)
  -- scoring is case-insensitive (patterns are already lowercased)
  local name = string.lower(df.normname)
  local df_id = string.sub(md5.sumhexa(name), 1, 7)

  -- special debugging function: prefixes every message with a short hash
  -- of the name to make the outputs grep-friendly
  local function dbg_score(msg, ...)
    texdoc.util.dbg_print('score', '(' .. df_id .. ') ' .. msg, ...)
  end

  -- paths are passed as arguments rather than concatenated, so that a '%'
  -- in a file name cannot be taken for a format directive
  dbg_score('Start scoring %s', df.realpath)
  dbg_score('Name used: %s', name)

  -- get score from patterns: keep the best one
  local score = -10
  local is_alias = false
  for _, pat in ipairs(df.matches) do
    local s = -10
    local p = string.lower(pat.name)
    if pat.original then -- non-alias
      if df.tree > -1 then
        s = pattern_score(name, p, dbg_score)
      else
        s = 1
      end
    elseif M.is_exact(name, p) then -- alias
      is_alias = true
      local bonus, note = 0, ''
      if pat.locale then
        bonus, note = 5, ', (language-based)'
      end
      s = (pat.score or 10) + bonus -- default alias score is 10
      dbg_score('Matching alias "%s", score: %.1f%s', pat.name, s, note)
    end
    if s > score then score = s end
  end
  dbg_score('Max pattern score: %.1f', score)

  -- get score from tlp associations (only when no pattern matched)
  if score == -10 and df.tlptodoc then
    score = -1
    dbg_score('New score: %.1f from package name association', score)
  end
  if score == -10 and df.runtodoc then
    score = -5
    dbg_score('New score: %.1f from sty/cls association', score)
  end

  -- bonus for metadata
  if df.details then
    if string.find(string.lower(df.details), 'readme') then
      score = score + 0.1
      dbg_score('Catalogue "readme" bonus: +0.1')
    else
      score = score + 1.5
      dbg_score('Catalogue details bonus: +1.5')
    end
  end

  -- bonus for locale
  local config_lang = texdoc.config.get_value('lang')
  if not is_alias then
    local file_lang
    -- from its catalogue
    if df.lang then
      -- take first two letters; it may have country codes
      file_lang = df.lang:sub(1, 2)
    end
    -- from its filename
    if not file_lang then
      -- locals on purpose: a bare assignment here would create a global `_`
      local _, parsed_lang = parse(name)
      file_lang = texdoc.const.lang_codes[parsed_lang]
    end
    if config_lang ~= nil and config_lang == file_lang then
      score = score + 1
      dbg_score('Locale match bonus: +1.0')
    elseif file_lang ~= nil and file_lang ~= 'en' then
      -- normally, english documents do not have file_lang,
      -- but sometimes catalogue includes en info (e.g., geometry)
      -- we want to treat both cases similar
      score = score - 0.1
      dbg_score('Locale unmatch: -0.1')
    end
  end

  -- adjust from keyword-specific tables
  local kw = original_kw
  if type(kw) == 'string' then
    kw = string.lower(kw)
  end
  local specific = spec_adjscore[kw]
  if df.tree > -1 and specific then
    for pat, val in pairs(specific) do
      if val and is_subword('/' .. name, pat) then
        score = score + val
        dbg_score('Adjust by %.1f from specific pattern "%s"', val, pat)
      end
    end
  end

  -- adjust from global tables; an unmatched file (score <= -10) can only be
  -- pushed further down (the debug line is printed for every matching
  -- pattern, whether or not the adjustment was applied)
  if df.tree > -1 then
    for pat, val in pairs(global_adjscore) do
      if val and is_subword('/' .. name, pat) then
        if score > -10 or val < 0 then score = score + val end
        dbg_score('Adjust by %.1f from global pattern "%s"', val, pat)
      end
    end
  end
  dbg_score('Final score: %.1f', score)

  -- the final score should be a float value
  df.score = score + 0.0
end
-- Score every docfile of `list` in place, in list order; `original_kw` is
-- handed to set_score unchanged.
local function set_list_scores(list, original_kw)
  for _, docfile in ipairs(list) do
    set_score(docfile, original_kw)
  end
end
-- Says if `filename` is an exact match for `pattern`. Both are split with
-- parse(); the match holds when
--   * the pattern has no language code, or it equals the file's one,
--   * the pattern has no (or an empty) extension, or it equals the file's,
--   * the bases are equal, or the file's base ends with '/' followed by
--     the pattern's base.
-- Returns a boolean.
function M.is_exact(filename, pattern)
  local f_base, f_lang, f_ext = parse(filename)
  local p_base, p_lang, p_ext = parse(pattern)

  local lang_ok = (p_lang == nil) or (f_lang == p_lang)
  local ext_ok = (p_ext == nil) or (p_ext == '') or (f_ext == p_ext)
  if not (lang_ok and ext_ok) then
    return false
  end

  if f_base == p_base then
    return true
  end
  -- otherwise the pattern base must be the last path component
  local tail_at = -#p_base
  return f_base:sub(tail_at) == p_base
    and f_base:sub(tail_at - 1, tail_at - 1) == '/'
end
-- Three-way helper for docfile_order: true when x sorts before y, false
-- when it sorts after, nil when the two tie on this criterion.
local function sorts_before(x, y)
  if x < y then return true end
  if x > y then return false end
  return nil
end

-- Ordering of two docfiles (see texdoclib-search.tlu for structure):
--   1. higher score first,
--   2. then lower ext_pos first (extensions ordered as in ext_list),
--   3. then lexicographically by normname,
--   4. then higher tree first.
-- Returns true if a is better than b (false for equal entries, as
-- table.sort requires).
local function docfile_order(a, b)
  -- arguments swapped so that the higher score wins
  local verdict = sorts_before(b.score, a.score)
  if verdict == nil then
    verdict = sorts_before(a.ext_pos, b.ext_pos)
  end
  if verdict == nil then
    verdict = sorts_before(a.normname, b.normname)
  end
  if verdict == nil then
    verdict = a.tree > b.tree
  end
  return verdict
end
----------------------------- public functions -----------------------------
-- Returns the index in ext_list of the most specific extension of
-- `filename`, or ext_list_max + 1 when no entry applies.
-- Entries of ext_list are tried in order:
--   '*' applies to any file, but only while nothing else has been chosen;
--   ''  applies to a file name without any '.';
--   any other entry applies when the name ends with '.' .. entry, and
--       replaces the current choice if that is unset, '*', or shorter.
function M.ext_pos(filename)
  -- remove zipext if applicable
  filename = texdoc.util.parse_zip(filename)

  -- now find the extension
  local best_pos, best_ext
  for idx, cand in ipairs(texdoc.config.get_value('ext_list')) do
    if cand == '*' and best_ext == nil then
      best_pos, best_ext = idx, cand
    elseif cand == '' and not filename:find('.', 1, true) then
      best_pos, best_ext = idx, cand
    elseif filename:sub(-#cand - 1) == '.' .. cand then
      if best_ext == nil or best_ext == '*' or #cand > #best_ext then
        best_pos, best_ext = idx, cand
      end
    end
  end

  if best_pos then
    return best_pos
  end
  return texdoc.config.get_value('ext_list_max') + 1
end
-- Classify a docfile by its score:
--   'good'   for a score above 0,
--   'bad'    for a score above -100 (up to and including 0),
--   'killed' for anything else.
function M.docfile_quality(df)
  local score = df.score
  if score > 0 then
    return 'good'
  end
  if score > -100 then
    return 'bad'
  end
  return 'killed'
end
-- sort a doclist in place, best docfile first
--   dl:          doclist to score and sort
--   original_kw: keyword handed down to the scoring functions, where it
--                selects the keyword-specific adjscore table
function M.sort_doclist(dl, original_kw)
-- NOTE(review): stop() is defined on the doclist elsewhere; it is called
-- before any scoring or reordering happens -- confirm what it guarantees
dl:stop()
-- every entry needs its score field before the comparator can be used
set_list_scores(dl, original_kw)
-- order: score, then extension position, then normname, then tree
table.sort(dl, docfile_order)
end
-- export the module table with the public functions defined above
return M
-- vim: ft=lua: