# Copyright (c) 1994 Regents of the University of California.
# All rights reserved.
# $Id: momconfig.pl,v 1.3 1994/08/10 10:18:29 fielding Exp $
# ---------------------------------------------------------------------------
# momconfig: A package for setting the configuration options of a
# World-Wide Web spider. This package exists so that all
# user-configurable defaults can be set in one package and then
# used by all of the mom* packages.
#
# Before changing things here, the installer should first:
#
# 1) Read and follow the installation instructions in docs/INSTALL.txt
# 2) Set the three things that need to be set in "momspider":
#    a) The first line, which specifies the perl interpreter;
#    b) The INClude path for libwww-perl library packages, $WWWlib;
#    c) The INClude path for MOMspider library packages, $MOMlib.
#
# Note that, except for LocalNetwork, it may not be necessary to change
# anything here if your operating system setup is similar to mine.
# Note also that this is real Perl code -- if you don't understand the
# syntax, take a look at the Perl manual (man perl) or at one of the many
# hypertext archives of Perl info, e.g. https://perldoc.perl.org/.
#
# This software has been developed by Roy Fielding as
# part of the Arcadia project at the University of California, Irvine.
#
# Redistribution and use in source and binary forms are permitted,
# subject to the restriction noted below, provided that the above
# copyright notice and this paragraph and the following paragraphs are
# duplicated in all such forms and that any documentation, advertising
# materials, and other materials related to such distribution and use
# acknowledge that the software was developed in part by the University of
# California, Irvine. The name of the University may not be used to
# endorse or promote products derived from this software without
# specific prior written permission. THIS SOFTWARE IS PROVIDED ``AS IS''
# AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT
# LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE.
#
# Use of this software in any way or in any form, source or binary,
# is not allowed in any country which prohibits disclaimers of any
# implied warranties of merchantability or fitness for a particular
# purpose or any disclaimers of a similar nature.
#
# IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY
# FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
# ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION
# (INCLUDING, BUT NOT LIMITED TO, LOST PROFITS) EVEN IF THE UNIVERSITY
# OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# If you have any suggestions, bug reports, fixes, or enhancements,
# send them to the author, Roy Fielding.
# ---------------------------------------------------------------------------
package momconfig;
# ==========================================================================
# Base settings that the later options build on.  The installer should not
# normally need to change these.
#
#   $MOMlib  - copied from the main package's $MOMlib
#   $HOMEdir - $HOME (or $home) from the environment, else '.'
#   $PWDdir  - $PWD (or $cwd) from the environment, else the directory
#              reported by Cwd::getcwd(), else '.'
#   $TMPdir  - $TMPDIR from the environment, else '/tmp'
$MOMlib  = $main::MOMlib;   # '::' instead of the deprecated "'" separator
$HOMEdir = ($ENV{'HOME'} || $ENV{'home'} || '.');
$PWDdir  = ($ENV{'PWD'} || $ENV{'cwd'});
unless ($PWDdir) {
    # $PWDdir is spliced into a file:// URL ($BaseURL) further down, where a
    # relative '.' would yield a malformed URL; ask the system for the real
    # working directory before giving up and using '.'.
    require Cwd;
    $PWDdir = (Cwd::getcwd() || '.');
}
$TMPdir  = ($ENV{'TMPDIR'} || '/tmp');
# ==========================================================================
# $LocalNetwork names the network domain that you consider to be local,
# i.e., the domain in which network requests to sites do not create any
# external network costs for your organization.
# YOU WILL WANT TO CHANGE THIS!!!  Use a backslash to escape any periods.
$LocalNetwork = '\.uci\.edu';
# ==========================================================================
# $sendmailCommand should point to the sendmail binary.  That program is
# assumed to take the addresses to which messages should be mailed as
# command-line arguments, and to read the other headers and the message
# text from stdin.
$sendmailCommand = '/usr/sbin/sendmail';
# ==========================================================================
# Decoding of traversable response content that has been encoded (so far,
# "encoded" only means compressed).  None of this may ever be used if your
# site does not compress any HTML files.
#
# %CEdecoder: content-encoding => command that decodes it.
%CEdecoder = (
    'x-compress' => 'uncompress',
    'x-gzip'     => 'gunzip',
);
# %CEextension: content-encoding => file extension the decoder expects.
%CEextension = (
    'x-compress' => '.Z',
    'x-gzip'     => '.gz',
);
# $CEfile: temporary filename used for decompression, given here without
# the .(Z|gz) suffix.
$CEfile = "${TMPdir}/mom$$-comp.html";
# ==========================================================================
# $TempIndex: default location of the working Index file.
$TempIndex    = "${TMPdir}/mom$$-index.html";
# Standard filename extensions for HTML index files: $Extension for the
# current version, $OldExtension for the old version.
$Extension    = 'html';
$OldExtension = 'old.html';
# ==========================================================================
# $InstructFile: default location of the instruction file.
$InstructFile = "${HOMEdir}/.momspider-instruct";
# %Allowed: the tasks MOMspider is allowed to perform.  Edit each value to
# say whether you want to allow (1) or not allow (0) MOMspider to traverse:
#   Owner - all links via TopURL owned by owner
#   Tree  - all links at or below TopURL
#   Site  - all links at TopURL's site
%Allowed = (
    'Owner' => 1,
    'Tree'  => 1,
    'Site'  => 1,
);
# ==========================================================================
# Default locations for the avoid and sites files.  These locations can be
# overridden in the instructions or on the command line.
# NOTE: the avoid and sites files must always be used in pairs, since the
# contents of each file are dependent on the other.
#
# $SystemAvoid: the systemwide avoidance file -- a structured list of URLs
# that all users of MOMspider must avoid (or leaf).
$SystemAvoid = "${MOMlib}/system-avoid";
# $SystemSites: the systemwide sites file -- a structured list of IP
# addresses, ports, and dates which indicate when the sites should next be
# checked for RobotsNotWanted restrictions.
$SystemSites = "${MOMlib}/system-sites";
# $AvoidFile and $SitesFile: the default user avoid and sites files.  They
# are exactly the same as the systemwide files but are intended to be
# written by whoever is running the spider.
$AvoidFile = "${HOMEdir}/.momspider-avoid";
$SitesFile = "${HOMEdir}/.momspider-sites";
# $RobotsURL: the standard name for the URL which defines, for any site,
# where robots are not allowed.  See Martijn Koster's proposal for more
# info.
$RobotsURL = '/robots.txt';
# $CheckInterval: default number of days between checks of a site's
# $RobotsURL.  This can be overridden in the instructions.
$CheckInterval = 15;
# ==========================================================================
# Settings which control the traversal process.
#
# Default maximum traversal depth.  Can be overridden by the instructions
# or the command line.
$MaxDepth = 20;
# The maximum number of seconds to wait for a response.  Increase if you
# have an extremely slow net connection.
$Timeout = 30;
# Max number of consecutive requests to any site before a long pause is
# required.  Don't change it.
$MaxConsec = 5;
# The number of seconds for a long pause.  Increase if your server is very
# slow.
$PauseTime = 60;
# Amount of time required between any two requests to the same site.
# Increase if server is slow.
$BetweenTime = 15;
# The initial Base URL -- no need to change this.
$BaseURL = "file://localhost${PWDdir}/";
# ==========================================================================
# DO NOT change the following unless you know exactly what you are doing
# AND have checked first with Roy Fielding.
#
# $Version is copied from the main package's $Version; '::' is used instead
# of the deprecated "'" package separator.
$Version = $main::Version;
# The location for distribution information about MOMspider.
# NOTE(review): as written this is only the stringified $Version -- confirm
# whether a fuller location was intended here.
$DistInfo =
    "$Version";
# ==========================================================================
1; # THIS LINE MUST BE LAST -- DO NOT CHANGE IT