#!/bin/sh
# Drives wget when wanting to grab part of a webpage hierarchy from a remote
# server. I found a simple wrapper necessary because there are a few too many
# options to remember.
#
# Please be considerate in the way you hammer webservers with this script.
#
# Iain Murray, November 2006

set -e

if [ $# -ne 1 ] ; then
    echo "usage: `basename $0` base_url"
    echo
    echo "`basename $0` will drive wget to grab base_url and all material linked to in"
    echo "its directory or child directories. The pages are dumped into your"
    echo "current directory, which must be empty. Links in html pages are converted so"
    echo "they should still work when using the local copy."
    echo
    exit 1
fi

if [ `ls -A | wc -l` -ne 0 ] ; then
    echo "I kinda expected to be run in an empty directory. Chickening out..."
    exit 1
fi

# Count the directory levels in the URL: strip the scheme and any trailing
# slash, delete everything except slashes, and count what remains.
# (printf is used rather than 'echo -n', which is not portable across
# /bin/sh implementations.)
LEVELS=`printf '%s' "$1" | sed -e 's/^[a-z]*:\/\///' -e 's/\/$//' -e 's/[^/]//g' | wc -c`

# Record where this copy came from and the exact wget command that was run.
date > SOURCE
echo snaffle "$1" >> SOURCE
echo which ran >> SOURCE
echo wget -r -np -p -k -nH --cut-dirs="$LEVELS" "$1" >> SOURCE
wget -r -np -p -k -nH --cut-dirs="$LEVELS" "$1"
date >> SOURCE

# consider using --no-cache

# Options I like:
# -r          recursive
# -np         no-parent, stops wget trying to download the whole website
# -p          page requisites (so always get all the images and stuff)
# -k          convert links so they will actually work on my computer
# -nH         I don't want a directory with the host name created
# --cut-dirs  This is set so that child directories will be created, but the
#             level given in the url will be saved in '.' rather than a
#             zillion sub-directories deep.
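
# Illustrative example (hypothetical URL, not from the original script): running
#     snaffle http://www.example.com/teaching/notes/
# in an empty directory strips the scheme and trailing slash, counts the two
# remaining slashes (LEVELS=2), and so effectively runs
#     wget -r -np -p -k -nH --cut-dirs=2 http://www.example.com/teaching/notes/
# Pages under .../teaching/notes/ then land directly in '.', while any child
# directories (e.g. .../teaching/notes/figs/) are recreated as ./figs/.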