It takes 50 lines.
-- Print out the top k most common words in a given input file.
-- Print equally common words in alphabetical order.
-- Never print more than k words.
--
-- Knuth's famous program in idiomatic Lua for a not-deathly-slow computer.
--
-- Jon Bentley's problem statement:
-- a user should be able to find the 100 most frequent words in a
-- twenty-page technical paper (roughly a 50KB file) without undue
-- emotional trauma.
--
-- Arguably Knuth's original data structures inflict "undue emotional trauma"
-- given the slowest computer available in the year 2025.
if #arg == 0 or #arg > 2 then
print('usage: lua common_words.lua [number of words to print]')
exit(1)
end
if #arg == 2 then
k = tonumber(arg[2])
else
k = 100
end
function main()
local freq = read_freq(arg[1]) -- read input_file into a dictionary of frequency counts
local words = sort_freq(freq) -- sort the dictionary by frequency
print_words(k, words, freq) -- output the results
end
function read_freq(input_file)
local result = {}
for line in io.lines(input_file) do
for word in line:gmatch('%a+') do
word = word:lower()
if result[word] == nil then
result[word] = 0
end
result[word] = result[word] + 1
end
end
return result
end
function sort_freq(h)
local result = {}
for k in pairs(h) do
table.insert(result, k)
end
table.sort(result, function(a, b)
if h[a] == h[b] then
return a < b -- words with same frequency in alphabetical order
end
return h[a] > h[b]
end)
return result
end
function print_words(k, words, freq)
for i = 1, math.min(k, #words) do
print(words[i], freq[words[i]])
end
end
main()
-- The text of Moby Dick (1.2MB) takes 0.1s to run on a 2.4GHz 2021 AMD
-- processor.
The original article from 1986.
Proximal inspiration.