#!/bin/sh

# Drives wget when wanting to grab part of a webpage hierarchy from a remote
# server. I found a simple wrapper necessary because there are a few too many
# options to remember.
#
# Please be considerate in the way you hammer webservers with this script.
#
# Iain Murray, November 2006

set -e  # abort immediately if any command fails (e.g. wget or sed below)

# Require exactly one argument (the base URL); otherwise print usage and quit.
if [ $# -ne 1 ] ; then
    # Usage/diagnostic text belongs on stderr so stdout stays clean for
    # pipelines. A single here-doc replaces a pile of unquoted echo lines
    # and is immune to accidental word-splitting or globbing.
    prog=$(basename -- "$0")
    cat >&2 <<EOF
usage: $prog base_url

$prog will drive wget to grab base_url and all material linked to in
its directory or child directories. The pages are dumped into your
current directory, which must be empty. Links in html pages are converted so
they should still work when using the local copy.

EOF
    exit 1
fi

# Refuse to run unless the current directory is empty (including dotfiles),
# so the mirror never mixes with — or clobbers — existing files.
entry_count=$(ls -A | wc -l)
if [ "$entry_count" -ne 0 ] ; then
    echo I kinda expected to be run in an empty directory. Chickening out...
    exit 1
fi

# Count the path depth of the URL: strip the scheme ("http://" etc.) and any
# trailing slash, delete every non-slash character, and count what is left.
# That slash count feeds --cut-dirs so the grabbed tree lands in '.' instead
# of a zillion sub-directories deep.
# NOTE: POSIX echo has no portable -n flag (some /bin/sh print "-n" literally,
# which would corrupt the count) — printf '%s' is the portable equivalent.
LEVELS=$(printf '%s' "$1" | sed -e 's/^[a-z]*:\/\///' -e 's/\/$//' -e 's/[^/]//g' | wc -c)

# Leave an audit trail in ./SOURCE: when the grab started, what URL was
# requested, the exact wget command used, and when it finished.
date > SOURCE
echo snaffle "$1" >> SOURCE
echo which ran >> SOURCE
echo wget -r -np -p -k -nH --cut-dirs="$LEVELS" "$1" >> SOURCE
wget -r -np -p -k -nH --cut-dirs="$LEVELS" "$1"
date >> SOURCE

# consider using --no-cache

# Options I like:
#        -r  recursive
#        -np no-parent, stops wget trying to download whole website
#        -p  page requisites (so always get all the images and stuff)
#        -k  convert links so they will actually work on my computer
#        -nH I don't want a directory with the host name created
# --cut-dirs This is set so that child directories will be created but the level
#            given in the url will be saved in '.' rather than a zillion
#            sub-directories deep.
