pandoc / lua-filters

A collection of lua filters for pandoc
MIT License
611 stars 166 forks source link

Add jgm's proof-of-concept Wikipedia lua-filter #120

Closed the-solipsist closed 3 years ago

the-solipsist commented 4 years ago

Hi, in this thread from May 2020, @jgm developed a proof-of-concept filter for Wikipedia content (which he notes can be improved). I think it might be worth adding to this collection for greater visibility as well as for improvements from other people.

--- Pick the highest-resolution candidate from an HTML `srcset` value.
-- Candidates are expected to look like `//host/a.png 1.5x, //host/b.png 2x`.
-- Only scheme-relative (`//…`) URLs are considered.
-- @param srcset string: raw value of the `srcset` attribute
-- @return the URL with the largest numeric descriptor; if no descriptor
--   can be parsed, the first `//…` URL found; nil if there is none.
local function largest_srcset_candidate(srcset)
  local best_url, best_size
  for url, size in srcset:gmatch("(//%g+)%s+([%d.]+)[xw]") do
    local n = tonumber(size)
    if n and (best_size == nil or n > best_size) then
      best_url, best_size = url, n
    end
  end
  -- Nothing with a usable descriptor: keep the first URL listed.
  return best_url or srcset:match("(//%g+)")
end

-- Remove fallback images and use highest-resolution thumbnail
-- if a srcset is given. It would be ideal to use svg but for
-- some reason wikipedia returns html pages when these are requested.
-- Returns an empty list (dropping the image) or the modified image.
function Image(el)
  if el.classes:find("mwe-math-fallback-image-inline") then
    return {}
  end
  -- Don't fetch from /static
  if el.src:match("^/static") then
    return {}
  end
  -- Don't bother with fake images
  if el.attributes.width == "1" then
    return {}
  end
  -- Find largest image in srcset. A single string.match only ever yields
  -- the first candidate, so every candidate has to be scanned.
  local src = el.src
  local srcset = el.attributes.srcset
  if srcset then
    src = largest_srcset_candidate(srcset) or el.src
  end
  -- el.src = el.src:gsub("%.svg.*", ".svg")
  -- Add https:// to scheme-relative URLs
  if src:match("^//") then
    src = "https:" .. src
  end
  el.src = src
  return el
end

-- Drop MediaWiki page chrome carried in spans: section edit links,
-- citation backlinks and "jump to" links. Any other span is left alone
-- (the function returns nothing, so pandoc keeps the element as is).
function Span(el)
  local unwanted = { "mw-editsection", "mw-cite-backlink", "mw-jump-link" }
  for _, class in ipairs(unwanted) do
    if el.classes:find(class) then
      return {}
    end
  end
end

-- Clean up Divs: navigation boxes are removed; when writing LaTeX,
-- thumbnail boxes are wrapped in a figure environment and their
-- caption Divs become a \caption followed by the caption text.
function Div(el)
  -- Navigation parts are dropped for every output format.
  if el.attributes.role == "navigation" then
    return {}
  end

  local for_latex = (FORMAT == "latex")

  -- Thumbnail container: surround it with a figure environment.
  if for_latex and el.classes:find("thumb") then
    local figure = {}
    figure[1] = pandoc.RawBlock("latex", "\\begin{figure}")
    figure[2] = el
    figure[3] = pandoc.RawBlock("latex", "\\end{figure}")
    return figure
  end

  -- Thumbnail caption: raw \caption plus the flattened caption content.
  if for_latex and el.classes:find("thumbcaption") then
    local command = pandoc.RawInline("latex","\\caption")
    local text = pandoc.Span(pandoc.utils.blocks_to_inlines(el.content))
    return pandoc.Plain({ command, text })
  end
end

-- Make links absolute so they work in PDFs.
--   * Links carrying the "image" class are replaced by their content.
--   * Links without any content are removed. This is checked before the
--     target is rewritten, so empty links are dropped whatever their target.
--   * Scheme-relative targets ("//host/path") only get "https:" prepended;
--     treating them as site-relative would produce
--     "https://en.wikipedia.org//host/path".
--   * Site-relative targets ("/wiki/…") are prefixed with the English
--     Wikipedia origin.
-- Any other link is left unchanged (nothing is returned).
function Link(el)
  if el.classes:find("image") then
    return el.content
  end
  -- Ignore empty links.
  if #(el.content) == 0 then
    return {}
  end
  local target = el.target
  if target:match("^//") then
    el.target = "https:" .. target
    return el
  end
  if target:match("^/") then
    el.target = "https://en.wikipedia.org" .. target
    return el
  end
end

-- Unicode math symbols that show up as plain text, mapped to the LaTeX
-- command for the same character. U+2217 (asterisk operator) is \ast;
-- \star is a different glyph (U+22C6).
local math_symbols =
  { ["⊆"] = "\\subseteq",
    ["∗"] = "\\ast",
    ["∈"] = "\\in",
    ["↦"] = "\\mapsto",
    ["⊂"] = "\\subset",
    ["∪"] = "\\cup" }

-- Fix math not marked up as such: a Str whose whole text is one of the
-- symbols above is replaced by inline math holding the LaTeX command.
-- Any other Str is left unchanged (nothing is returned).
function Str(el)
  local replacement = math_symbols[el.text]
  if replacement then
    return pandoc.Math("InlineMath", replacement)
  end
end

-- Depth-first search through nested Divs for the one whose identifier
-- is "mw-content-text". Only Div elements are inspected and descended
-- into. Returns that Div, or nothing if `els` is nil or holds no match.
local function find_content(els)
  if not els then
    return
  end
  for _, block in ipairs(els) do
    if block.t == 'Div' then
      if block.identifier == "mw-content-text" then
        return block
      end
      -- Not the target itself: look inside it, if it has children.
      local found = block.content and find_content(block.content)
      if found then
        return found
      end
    end
  end
end

-- Isolate the content: reduce the document to the "mw-content-text" Div
-- located by find_content. If no such Div exists, the document is
-- returned untouched; assigning `{ nil }` to the blocks would silently
-- throw the whole document away.
function Pandoc(el)
  local main = find_content(el.blocks)
  if main then
    el.blocks = { main }
  end
  return el
end
alerque commented 4 years ago

It's a lot easier to do code reviews in a PR than in an issue with a code block.

tarleb commented 3 years ago

I'm closing this, as I'm not interested in documenting and maintaining this filter. However, if somebody would do the extra preparatory work, promises to stick around to maintain it, and opens a PR, then we'd probably merge it.