So I've finally written my first at least slightly useful Erlang module,
fetch.
fetch is a URL fetcher that can download the content of a single URL or of a list of URLs specified in a file (one URL per line):
-module(fetch).
-export([fetch_url/1, fetch_urls_parallel/1,
fetch_urls_sequential/1, main/1]).
-import(lib_misc, [pmap/2]).
% Fetches Url over HTTP and returns {Url, Body}.
%
% On success Body is the response body as returned by httpc:request/1.
% On a request error Body is the empty list, so callers cannot tell a
% failed request apart from an empty response.
fetch_url(Url) ->
    % The return value is ignored so that calling this once per URL is
    % harmless when inets is already running.
    inets:start(),
    % httpc is the HTTP client module in inets; the old `http` module is
    % no longer part of OTP.
    case httpc:request(Url) of
        {ok, {_StatusLine, _Headers, Body}} ->
            {Url, Body};
        % Error, return empty string
        {error, _Reason} ->
            {Url, []}
    end.
% Reads the file Filename and returns its non-empty lines as a list of
% strings. Both "\n" and "\r\n" line endings are accepted.
% Crashes with a badmatch if the file cannot be read.
read_lines(Filename) ->
    {ok, Contents} = file:read_file(Filename),
    % string:tokens/2 treats every character of the separator list as a
    % delimiter and drops empty tokens, so "\r\n" strips carriage returns
    % from CRLF files and skips blank lines.
    string:tokens(binary_to_list(Contents), "\r\n").
% Fetches every URL in L via lib_misc:pmap/2 and returns the results as a
% list of the form [{Url1, "content"}, ...].
fetch_urls_parallel(L) ->
    lib_misc:pmap(fun fetch_url/1, L).
% Fetches every URL in L one after another and returns a list of the form
% [{Url1, "content"}, ...], in the same order as L.
fetch_urls_sequential(L) ->
    lists:map(fun fetch_url/1, L).
% Entry point. Usage: fetch:main([Filename])
%
% Reads the URLs listed in the file, fetches them all in parallel, prints
% one "url:<Url> <content length>" line per result and then stops the
% Erlang runtime.
main([A]) ->
    Urls = read_lines(A),
    Results = fetch_urls_parallel(Urls),
    % Prints a single result: the URL followed by the size of its content.
    PrintResult =
        fun({Url, Content}) ->
            io:format("url:~s ~w ~n", [Url, length(Content)])
        end,
    lists:foreach(PrintResult, Results),
    % Shut the node down so that `erl -noshell -s fetch main ...` exits.
    init:stop().
Start with
diaspora:~/uri_test$ erlc fetch.erl; erl -noshell -s fetch main uris3.txt
(lib_misc needs to be on your Erlang code path)
The interesting part is
fetch_urls_parallel that uses the
lib_misc:pmap function from Joe Armstrong's Erlang book to fetch all urls in the list
L in parallel. This can be contrasted with
fetch_urls_sequential that fetches each url one by one using normal
lists:map.
For fun I decided to write something similar in Ruby and compare the speed. Here's the Ruby code:
require 'open-uri'
require 'timeout'

# Fetches every URL listed in the file named by ARGV[0] (one URL per line),
# using one thread per URL, and prints "<url> <body size>" for each of them.
# A URL that fails or takes longer than 20 seconds is reported with size 0.
threads = []
File.open(ARGV[0]) do |f|
  f.each_line do |line|
    # chomp removes the line terminator only when one is present, so a last
    # line without a trailing newline keeps its final character.
    url = line.chomp
    next if url.empty?

    # Pass the URL in as a thread argument so each thread owns its own copy.
    threads << Thread.new(url) do |u|
      body = ""
      begin
        Timeout.timeout(20) do
          # URI.open is the open-uri entry point; Kernel#open no longer
          # opens URLs on current Ruby versions.
          URI.open(u) { |response| body = response.read }
        end
      rescue StandardError, Timeout::Error
        # Best effort: a failed or timed-out download is reported as size 0.
      ensure
        STDOUT.printf "%s %d\n", u, body.size
        STDOUT.flush
      end
    end
  end
end
threads.each { |t| t.join }
Using a file with 100 urls and running each program 100 times I found that the Erlang code was almost 50% faster than the Ruby code! Not really sure why, but I suspect that open-uri plays a big part.