#!/usr/bin/env python
'''
If you use Fedora on more than one machine at your site, it is worth hosting
your own copy of the main repository, simply by copying all the RPM files in
one directory, and running "createrepo --update --database ." in that
directory.

The best way to keep this repository up to date is to use rsync against one
of the available Fedora mirrors. If you update on a regular basis, say
weekly, it takes only a few minutes each time.

The problem is the initial download: more than twelve thousand files and
17 GiB for Fedora 10 ! Typically, mirrors throttle each connection, which
means that it could take several days to download the entire repository.

In order to speed this up, and spread the load across several mirrors, this
script rsync's only a certain number of files from each mirror.

In this initial version the files are split according to the first character
of their name. This is far from an ideal solution since the number of files
differs greatly depending on the first character of their name, for example
for Fedora 10, there were 4 RPM files starting with 'A' but over 2000
starting with 'p'. Size of files also varies greatly from about 2 KiB to
360 MiB, rendering this algorithm even more inadequate. I'm looking into
better schemes based both on size and number of files. If you have ideas
around this (and I mean something more than "oh just use balanced trees"),
drop me a note at yves@zioup.com.

Bugs / comments:

-I did think of creating a permanent list (because you don't have enough
 dot files in your home directory :-), but the reliability and speed of the
 different sites hosting mirrors for Fedora seem very inconsistent, a site
 that works well and is fast one day might not answer, or worse, lock up
 this script another day, and vice-versa.

-This script alleviates the load on the servers, but... it means that you
 will have up to 62 rsync processes going at once on your machine. The
 bottleneck always seems to be the servers, the client side has never been
 an issue for me on my machine, and we're talking a little VIA C7 with a
 SATA disk from 2007 here. You've been warned !

Copyright Yves Dorfsman, Calgary, 2008.

This is free software. You can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
'''

from __future__ import print_function

import optparse
import os
import random
import re
import string
import subprocess
import sys
import tempfile

try:
    from urllib.request import urlopen as _urlopen   # Python 3
except ImportError:
    from urllib import urlopen as _urlopen           # Python 2

# A directory listing line that advertises the "releases" directory.
_RELEASES_RE = re.compile(r'dr.....r.. .*releases')

# One rsync is started per possible first character of a package name:
# 0 - 9, A - Z, a - z (62 characters).
_FIRST_CHARS = string.digits + string.ascii_uppercase + string.ascii_lowercase


def createUniqueLogfile(logdir, url):
    '''
    Create a file with a unique name derived from the url, and return the
    file name.

    We close the file but do not delete it to let the sub-process re-use it
    (we have to create it to guarantee uniqueness). We don't use tempfile
    because it create ugly file names.

    The first candidate is "<name>.log", followed by "<name>.002.log" up to
    "<name>.099.log".

    An empty string is returned when the url is malformed (no "rsync://",
    or nothing usable after it) or when none of the candidate names could
    be created.
    '''
    suffixes = [''] + ['.' + str(n).zfill(3) for n in range(2, 100)]

    try:
        basename = re.split('rsync://', url)[1]
    except (IndexError, TypeError):
        # url is malformed, although we should never run this code because
        # the url should have been verified already.
        return ''

    # clean up (get rid of trailing slashes, double slash etc..)
    basename = os.path.normpath(basename)
    # file names can't have slashes.
    basename = basename.replace('/', '..')
    # A name made only of dots ("rsync://", "rsync:///", "rsync://..") would
    # resolve to logdir itself or to one of its parents.
    if not basename.strip('.'):
        return ''
    basename = os.path.normpath(logdir + '/' + basename)

    for suffix in suffixes:
        logfile = basename + suffix + '.log'
        try:
            # O_EXCL makes the creation fail if the name is already taken.
            fd = os.open(logfile, os.O_WRONLY | os.O_CREAT | os.O_EXCL)
        except OSError:
            continue
        os.close(fd)
        return logfile

    # Every candidate name is taken (or the directory is not writable).
    return ''


def verifyMirror(m_url):
    '''
    Run a simple rsync against the url, to check if it returns a directory
    listing including a line

        drwxrwxrwx .......... releases

    If it does not work we check the url + linux/.

    We terminate all our urls with a slash because of rsync's
    interpretation (give the content of that directory).

    We return the url that worked, or an empty string if we don't obtain
    the results we are looking for.
    '''
    ##################################################
    # code to test when offline
    # import random
    # if random.choice((True, False)):
    #     return m_url
    # else:
    #     return ''
    ##################################################

    # Some links point directly to the linux directory, some don't
    if not m_url.endswith('/'):
        m_url += '/'

    for new_url in (m_url, m_url + 'linux/'):
        # we use the timeout to avoid getting stuck while looking for valid
        # servers. The url is passed as a single argument (no shell), so
        # whatever a mirror list contains cannot be interpreted as a command.
        try:
            proc = subprocess.Popen(['rsync', '--timeout=7', new_url],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT)
        except OSError:
            # rsync itself could not be started.
            return ''
        raw_output = proc.communicate()[0]

        # If rsync did not work, this mirror is not valid, we do not need to
        # test with linux/ at the end
        if proc.returncode != 0:
            return ''

        listing = raw_output.decode('utf-8', 'replace').split('\n')
        matches = [line for line in listing if _RELEASES_RE.match(line)]
        if len(matches) == 1:
            return new_url

    # if we got here, it means we never found releases
    return ''


def _fetchLines(url):
    '''Return the lines of the document at url as text; close the handle.'''
    page = _urlopen(url)
    try:
        raw_lines = page.readlines()
    finally:
        page.close()
    # Python 3 hands back bytes, Python 2 already gives str.
    return [line if isinstance(line, str) else line.decode('utf-8', 'replace')
            for line in raw_lines]


def _extractRsyncUrls(lines):
    '''Return the first double-quoted rsync:// url of each line that has one.'''
    # simplistic scrubbing... I like my soup hot, not beautiful...
    urls = []
    for line in lines:
        if 'rsync://' not in line:
            continue
        # Splitting on '"' leaves the quoted values at the odd indexes.
        for field in line.split('"')[1::2]:
            candidate = field.strip()
            if candidate.startswith('rsync://'):
                urls.append(candidate)
                break
    return urls


def _startRsync(source, logfile):
    '''
    Start a detached "nohup rsync -vut0 source ." logging to logfile.

    Returns True if the process could be started, False otherwise. The
    download itself is not waited for.
    '''
    try:
        with open(logfile, 'w') as log:
            with open(os.devnull, 'r') as devnull:
                # setsid puts the download in its own session so that it is
                # not hit by signals sent to this script's terminal.
                subprocess.Popen(['nohup', 'rsync', '-vut0', source, '.'],
                                 stdin=devnull,
                                 stdout=log,
                                 stderr=subprocess.STDOUT,
                                 preexec_fn=os.setsid)
    except (IOError, OSError):
        return False
    return True


def _launchDownloads(urls, path_suffix, logdir, debug):
    '''
    Start one rsync per first character, rotating through the mirrors.

    A mirror for which the download cannot be started is dropped from the
    rotation. Returns the same (int, string) tuple as main().
    '''
    usable = [True] * len(urls)
    idx = 0
    for char in _FIRST_CHARS:
        # At most one full pass over the mirrors for each character.
        for _attempt in range(len(urls)):
            current = idx
            idx = (idx + 1) % len(urls)
            if not usable[current]:
                continue

            url = urls[current]
            started = False
            logfile = createUniqueLogfile(logdir, url)
            # An empty logfile means there is a problem with this url,
            # probably malformed.
            if logfile != '':
                if debug:
                    print('rsync\'ing ' + url)
                started = _startRsync(url + path_suffix + char + '*', logfile)
                if not started and debug:
                    print('got an error from: ' + url)

            if started:
                break

            usable[current] = False
            if debug:
                print('removing "' + url + '" from the list.\n')
        else:
            # we've tried all the mirrors, none are valid
            return (1, 'Could not contact any mirror.')
    return (0, '')


def main(version, architecture, repository, debug=False, logdir='/tmp'):
    '''
    Takes three mandatory arguments:
      -version of Fedora (8, 9, 10 etc...)
      -architecture (i386, x86_64, ppc, ia64, sparc)
      -repository is one of "os", "Everything", "updates"
    and two optional:
      -debug mode
      -directory where to put the logfiles (default '/tmp')

    main returns a tuple composed of an int and a string. The int is zero on
    successful completion, and non-zero otherwise, and the string will
    contain a brief explanation of the problem.

    IT NEEDS TO BE RUN FROM THE PACKAGES DIRECTORY.
    >>>> It "downloads" everything in the current working directory. <<<<
    '''
    if repository not in ('os', 'Everything', 'updates'):
        return (1, 'repository needs to be one of "os", "Everything", or "updates"')

    # confirm version is an integer, then convert to string
    try:
        version = str(int(version))
    except (TypeError, ValueError):
        return (1, 'version needs to be an integer')

    if debug:
        print("\nChecking log directory exists and is writable...")
    if not os.path.isdir(logdir):
        return (1, '"' + logdir + '" is not a directory.')

    # the default number of files tempfile will try is ridiculous, so we
    # adjust it for the duration of the check.
    saved_tmp_max = tempfile.TMP_MAX
    tempfile.TMP_MAX = 49
    try:
        tempfd = tempfile.NamedTemporaryFile(dir=logdir)
    except (IOError, OSError):
        return (1, 'Could not create a temporary file. Is "' + logdir + '" writeable ?')
    finally:
        tempfile.TMP_MAX = saved_tmp_max
    tempfd.close()
    if debug:
        print('"' + logdir + '"', "is writeable.")

    # url = 'file:index.html'   # for offline tests.
    url = 'http://mirrors.fedoraproject.org/publiclist/Fedora'
    url += '/' + version
    url += '/' + architecture
    url += '/'

    if debug:
        print('\nfetching "' + url + '"\n')
    try:
        lines = _fetchLines(url)
    except Exception:
        # Whatever went wrong (network, HTTP, malformed url), report it to
        # the caller through the return value.
        return (1, 'Could not fetch ' + url)

    candidates = _extractRsyncUrls(lines)
    if debug:
        print('found ' + str(len(candidates)) + ' rsync mirrors.\n')

    # The mirrors are listed in alphabetical order, so we would hit the
    # same guys every time, not fair...
    random.shuffle(candidates)

    # We need to check which mirrors work at this point in time
    working = []
    for candidate in candidates:
        if debug:
            print('verifying:\n' + candidate)
        new_url = verifyMirror(candidate)
        if new_url == '':
            if debug:
                print('marking as bad.\n')
        else:
            if debug:
                print('good, now using:\n' + new_url + '\n')
            working.append(new_url)

    # Cleaning up
    if debug:
        print('Removing bad mirrors...')
    if not working:
        return (1, 'Could not find any working rsync mirror.')
    if debug:
        print(str(len(working)) + ' validated rsync mirrors\n')

    suffix = {
        'os': 'releases/' + version + '/Fedora/' + architecture + '/os/Packages/',
        'Everything': 'releases/' + version + '/Everything/' + architecture + '/os/Packages/',
        'updates': 'updates/' + version + '/' + architecture + '/',
    }

    return _launchDownloads(working, suffix[repository], logdir, debug)


if __name__ == '__main__':

    parser = optparse.OptionParser(usage='%prog [-l logdir] [-d] -v vers -a arch -r repo\n\nTHIS WILL DOWNLOAD ALL THE PACKAGES IN THE CURRENT WORKING DIRECTORY\n\n')

    parser.add_option('-v', dest='version', type='int',
                      help='Fedora version, e.g.: "-v 9"')
    parser.add_option('-a', dest='architecture', type='string',
                      help='architecture, e.g.: "-a i386". Other choices at time of writing were "x86_64", "ppc", "ia64" and "sparc".')
    parser.add_option('-r', dest='repository', type='choice',
                      choices=("os", "Everything", "updates"),
                      help='which repository, e.g.: "-r os". The repository has to be one of "os", "Everything" or "updates".')
    parser.add_option('-d', dest='debug', action='store_true', default=False,
                      help='debug, makes it more verbose.')
    parser.add_option('-l', dest='logdir', type='string', default='/tmp',
                      help='directory where the log files are going to be created. The directory must be already existing. if this option is not specified, "/tmp" will be used.')

    (o, args) = parser.parse_args()

    # parser.error() prints the message and terminates the script itself.
    if len(args) != 0:
        parser.error('No arguments should be used.\n')

    if o.version is None or o.architecture is None or o.repository is None:
        parser.error('version, architecture and repository needs to be specified.\n')

    returned = main(o.version, o.architecture, o.repository, o.debug, o.logdir)
    if returned[0] != 0:
        sys.stderr.write(returned[1] + '\n')
    sys.exit(returned[0])