#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vi:ts=4:et
# $Id$

import sys, threading, queue
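# pycurl is a third-party package; if the import below fails, install it
# first (for example with "pip install pycurl").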
import pycurl

# We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
# the libcurl tutorial for more info.
try:
    import signal
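    # Importing SIGPIPE by name makes this block fail with ImportError on
    # platforms (such as Windows) that do not define it, so the handler is
    # only installed where SIGPIPE exists.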
    from signal import SIGPIPE, SIG_IGN
    signal.signal(signal.SIGPIPE, signal.SIG_IGN)
except ImportError:
    pass


class WorkerThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            try:
                url, filename = self.queue.get_nowait()
            except queue.Empty:
                # No more URLs to fetch -- let this worker thread exit.
                return
            f = open(filename, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.FOLLOWLOCATION, 1)
            curl.setopt(pycurl.MAXREDIRS, 5)
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.WRITEDATA, f)
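            # NOSIGNAL is required when using libcurl from multiple threads:
            # it stops libcurl from using signals for timeout handling, which
            # is not thread-safe.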
            curl.setopt(pycurl.NOSIGNAL, 1)
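            # Give up if connecting takes more than 30 seconds or the whole
            # transfer more than 300 seconds.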
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            try:
                curl.perform()
            except pycurl.error:
                import traceback
                traceback.print_exc(file=sys.stderr)
                sys.stderr.flush()
            curl.close()
            f.close()
            sys.stdout.write(".")
            sys.stdout.flush()
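
# Note: each worker creates a fresh Curl handle for every URL.  Reusing a
# single handle per worker (create it once in run(), close it when the queue
# is empty) would let libcurl keep connections to the same host alive between
# requests; the per-URL handle used here is simpler but forgoes that reuse.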

# Read the list of URLs from the file specified on the command line
try:
    with open(sys.argv[1]) as url_file:
        urls = url_file.readlines()
    num_workers = int(sys.argv[2])
except (IndexError, ValueError, OSError):
    # Arguments were missing or invalid, or the URL file could not be
    # read -- show the usage string.
    print("Usage: %s <file with URLs to fetch> <number of worker threads>" % sys.argv[0])
    raise SystemExit
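
# Example invocation (assuming a file urls.txt with one URL per line):
#   python retriever.py urls.txt 5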

# Initialize the thread list and the file number used to name saved documents
threads = []
fileno = 0
work_queue = queue.Queue()
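# queue.Queue is thread-safe, so the workers can pull URLs from it
# without any extra locking.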

# Fill the work input queue with URLs
for url in urls:
    url = url.strip()
    if not url:
        # Skip blank lines in the URL file
        continue
    fileno = fileno + 1
    filename = "doc_%d" % (fileno,)
    work_queue.put((url, filename))

# Start a bunch of threads
for _ in range(num_workers):
    t = WorkerThread(work_queue)
    t.start()
    threads.append(t)

# Wait for all threads to finish
for thread in threads:
    thread.join()
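
# All workers are done -- finish the progress line of dots.
sys.stdout.write("\n")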