#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Usage: crawl [options] <crawl_dir> <num_rounds>
#
# Arguments:
#   <crawl_dir>                           Directory where the crawl/host/link/segments dirs are saved
#   <num_rounds>                          The number of rounds to run this crawl for
#
# Options:
#   -i|--index                            Indexes crawl results into a configured indexer
#   -D <property>=<value>                 A Nutch or Hadoop property to pass to Nutch calls, overriding
#                                         properties defined in configuration files, e.g.
#                                         increase content limit to 2MB:
#                                           -D http.content.limit=2097152
#                                         (in distributed mode) configure memory of map and reduce tasks:
#                                           -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m
#                                           -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m
#   -w|--wait <NUMBER[SUFFIX]>            Time to wait before generating a new segment when no URLs
#                                         are scheduled for fetching. Suffix can be: s for second,
#                                         m for minute, h for hour and d for day. If no suffix is
#                                         specified second is used by default. [default: -1]
#   -s <seed_dir>                         Path to seeds file(s)
#   -sm <sitemap_dir>                     Path to sitemap URL file(s)
#
#   --hostdbupdate                        Boolean flag indicating whether to update the HostDB after each round
#   --hostdbgenerate                      Boolean flag indicating whether to use the HostDB in the generate step
#
#   --num-fetchers <num_fetchers>         Number of tasks used for fetching (fetcher map tasks) [default: 1]
#                                         Note: This can only be set when running in distributed mode and
#                                               should correspond to the number of worker nodes in the cluster.
#   --num-tasks <num_tasks>               Number of reducer tasks [default: 2]
#   --size-fetchlist <size_fetchlist>     Number of URLs to fetch in one iteration [default: 50000]
#   --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
#   --num-threads <num_threads>           Number of threads for fetching / sitemap processing [default: 50]
#   --sitemaps-from-hostdb <frequency>    Whether and how often to process sitemaps based on HostDB.
#                                         Supported values are: never (default), always (processing takes
#                                         place in every iteration), once (processing only takes place in
#                                         the first iteration)
#
#   --dedup-group <none|host|domain>      Deduplication group method [default: none]
#
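# Example (hypothetical seed and crawl directories): run five crawl rounds, index the
# results into the configured indexer, and raise the content limit to 2MB:
#
#   crawl -i -D http.content.limit=2097152 -s urls/ crawldir/ 5
#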

function __to_seconds() {
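  # convert "<NUMBER>[s|m|h|d]" into seconds, e.g. "10m" -> 600; defaults to seconds if no suffix is given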
  NUMBER=$(echo $1 | tr -dc '0-9')
  MODIFIER=$(echo $1 | tr -dc 'smhdSMHD')

  case $MODIFIER in
      m|M)
        SECONDS=`expr $NUMBER \* 60`
        ;;
      h|H)
        SECONDS=`expr $NUMBER \* 3600`
        ;;
      d|D)
        SECONDS=`expr $NUMBER \* 86400`
        ;;
      s|S|*)
        SECONDS=$NUMBER
        ;;
  esac

  echo $SECONDS
}

function __print_usage {
  echo "Usage: crawl [options] <crawl_dir> <num_rounds>"
  echo -e ""
  echo -e "Arguments:"
  echo -e "  <crawl_dir>\t\t\t\tDirectory where the crawl/host/link/segments dirs are saved"
  echo -e "  <num_rounds>\t\t\t\tThe number of rounds to run this crawl for"
  echo -e ""
  echo -e "Options:"
  echo -e "  -i|--index\t\t\t\tIndexes crawl results into a configured indexer"
  echo -e "  -D\t\t\t\t\tA Nutch or Hadoop property to pass to Nutch calls overwriting"
  echo -e "  \t\t\t\t\tproperties defined in configuration files, e.g."
  echo -e "  \t\t\t\t\tincrease content limit to 2MB:"
  echo -e "  \t\t\t\t\t  -D http.content.limit=2097152"
  echo -e "  \t\t\t\t\t(distributed mode only) configure memory of map and reduce tasks:"
  echo -e "  \t\t\t\t\t  -D mapreduce.map.memory.mb=4608    -D mapreduce.map.java.opts=-Xmx4096m"
  echo -e "  \t\t\t\t\t  -D mapreduce.reduce.memory.mb=4608 -D mapreduce.reduce.java.opts=-Xmx4096m"
  echo -e "  -w|--wait <NUMBER[SUFFIX]>\t\tTime to wait before generating a new segment when no URLs"
  echo -e "  \t\t\t\t\tare scheduled for fetching. Suffix can be: s for second,"
  echo -e "  \t\t\t\t\tm for minute, h for hour and d for day. If no suffix is"
  echo -e "  \t\t\t\t\tspecified second is used by default. [default: -1]"
  echo -e "  -s <seed_dir>\t\t\t\tPath to seeds file(s)"
  echo -e "  -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
  echo -e "  --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
  echo -e "  --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
  echo -e "  --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]"
  echo -e "  \t\t\t\t\tNote: This can only be set when running in distributed mode and"
  echo -e "  \t\t\t\t\t      should correspond to the number of worker nodes in the cluster."
  echo -e "  --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
  echo -e "  --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
  echo -e "  --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
  echo -e "  --num-threads <num_threads>\t\tNumber of threads for fetching / sitemap processing [default: 50]"
  echo -e "  --sitemaps-from-hostdb <frequency>\tWhether and how often to process sitemaps based on HostDB."
  echo -e "  \t\t\t\t\tSupported values are:"
  echo -e "  \t\t\t\t\t  - never [default]"
  echo -e "  \t\t\t\t\t  - always (processing takes place in every iteration)"
  echo -e "  \t\t\t\t\t  - once (processing only takes place in the first iteration)"
  echo -e "  --dedup-group <none|host|domain>\tDeduplication group method [default: none]"

  exit 1
}

# default values
INDEXFLAG=false
HOSTDBUPDATE=false
HOSTDBGENERATE=false
HADOOP_PROPERTIES=()
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
NUM_FETCHERS=1
NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
DEDUP_GROUP=none

while [[ $# -gt 0 ]]
do
    case $1 in
        -i|--index)
            INDEXFLAG=true
            shift
            ;;
        -D)
            HADOOP_PROPERTIES=("${HADOOP_PROPERTIES[@]}" -D"${2}")
            shift 2
            ;;
        -s)
            SEEDDIR="${2}"
            shift 2
            ;;
        -sm)
            SITEMAPDIR="${2}"
            shift 2
            ;;
        -w|--wait)
            WAIT="${2}"
            shift 2
            ;;
        --num-slaves)
            # backward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
            NUM_FETCHERS="${2}"
            shift 2
            ;;
        --num-fetchers)
            NUM_FETCHERS="${2}"
            shift 2
            ;;
        --num-tasks)
            NUM_TASKS="${2}"
            shift 2
            ;;
        --size-fetchlist)
            SIZE_FETCHLIST="${2}"
            shift 2
            ;;
        --time-limit-fetch)
            TIME_LIMIT_FETCH="${2}"
            shift 2
            ;;
        --num-threads)
            NUM_THREADS="${2}"
            shift 2
            ;;
        --sitemaps-from-hostdb)
            SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
            shift 2
            ;;
        --dedup-group)
            DEDUP_GROUP="${2}"
            shift 2
            ;;
        --hostdbupdate)
            HOSTDBUPDATE=true
            shift
            ;;
        --hostdbgenerate)
            HOSTDBGENERATE=true
            shift
            ;;
        *)
            break
            ;;
    esac
done

if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
  echo "Error: --sitemaps-from-hostdb <frequency> has to be one of never, always, once."
  echo -e ""
  __print_usage
fi

if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
  echo "Error: --dedup-group <mode> has to be one of none, host, domain."
  echo -e ""
  __print_usage
fi

if [[ $# != 2 ]]; then
  __print_usage
fi

CRAWL_PATH="$1"
LIMIT="$2"

# convert wait time to seconds for compatibility reasons
if [ "$WAIT" != "-1" ]; then
  WAIT=$( __to_seconds "$WAIT" )
  echo "Time to wait (--wait) = $WAIT sec."
fi

bin="`dirname "$0"`"
bin="`cd "$bin"; pwd`"

# determine the mode (local or distributed) based on the presence of a job file
mode=local
if [ -f "${bin}"/../*nutch*.job ]; then
  mode=distributed
fi
if [[ "$mode" = "local" ]]; then
  if [[ "$NUM_FETCHERS" -ne 1 ]]; then
    echo "Ignoring configured number of fetchers (--num_fetchers): a single fetcher task is used when running in local mode."
  fi
  NUM_FETCHERS=1
fi

# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
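# (for example, -Dmapreduce.job.reduces could instead be configured in mapred-site.xml as
#  <property><name>mapreduce.job.reduces</name><value>2</value></property>)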
commonOptions=("${HADOOP_PROPERTIES[@]}" -Dmapreduce.job.reduces=$NUM_TASKS -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true)

# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
 if [ $(which hadoop | wc -l ) -eq 0 ]; then
    echo "Can't find Hadoop executable. Add HADOOP_COMMON_HOME/bin to the path or run in local mode."
    exit 1;
 fi
fi


function __bin_nutch {
    # run $bin/nutch, exit if exit value indicates error

    echo "$bin/nutch $@" ;# echo command and arguments
    "$bin/nutch" "$@"

    RETCODE=$?
    if [ $RETCODE -ne 0 ]
    then
        echo "Error running:"
        echo "  $bin/nutch $@"
        echo "Failed with exit value $RETCODE."
        exit $RETCODE
    fi
}

# check if directory exists locally or on hdfs
function __directory_exists {
  if [[ "$mode" == local  &&  -d "$1" ]]; then
    return 0
  elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
    return 0
  else
    return 1
  fi
}

function __update_hostdb {
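  # create or update the HostDB from the CrawlDB (skipped if the CrawlDB does not exist yet)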
  if __directory_exists "$CRAWL_PATH"/crawldb; then
    echo "Updating HostDB"
    __bin_nutch updatehostdb "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
  fi
}

# initial injection
if [[ ! -z $SEEDDIR  ]]; then
  echo "Injecting seed URLs"
  __bin_nutch inject "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$SEEDDIR"
fi

# sitemap processing based on sitemap definition file(s)
if [[ ! -z $SITEMAPDIR ]]; then
  echo "Processing sitemaps defined in $SITEMAPDIR"
  __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH/crawldb" -sitemapUrls "$SITEMAPDIR" -threads $NUM_THREADS
fi

# main loop : rounds of generate - fetch - parse - update
for ((a=1; ; a++))
do
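  # a file named .STOP in the current working directory stops the crawl before the next iteration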
  if [ -e ".STOP" ]; then
    echo "STOP file found - escaping loop"
    break
  fi

  if [ $LIMIT -ne -1 ]; then
    if [ $a -gt $LIMIT ]; then
      echo `date` ": Finished loop with $LIMIT iterations"
      break
    fi
    echo `date` ": Iteration $a of $LIMIT"
  else
    echo `date` ": Iteration $a"
  fi

  if [[ "$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "always" || ("$SITEMAPS_FROM_HOSTDB_FREQUENCY" = "once" && $a -eq 1) ]]; then
    # create / update HostDB on first run
    [[ $a -eq 1 ]] && __update_hostdb

    # sitemap processing based on HostDB
    if __directory_exists "$CRAWL_PATH"/hostdb; then
      echo "Processing sitemaps based on hosts in HostDB"
      __bin_nutch sitemap "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
    fi
  fi

  echo "Generating a new segment"
  if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
   generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
  else
   generate_args=("${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
  fi

  echo "$bin/nutch generate ${generate_args[@]}"
  $bin/nutch generate "${generate_args[@]}"
  RETCODE=$?
  if [ $RETCODE -eq 0 ]; then
      : # ok: no error
  elif [ $RETCODE -eq 1 ]; then
    echo "Generate returned 1 (no new segments created)"

    if [ "$WAIT" -ne -1 ]; then
      echo "Waiting for $WAIT sec. ..."
      sleep $WAIT
      continue
    else
      echo "Escaping loop: no more URLs to fetch now"
      break
    fi
  else
    echo "Error running:"
    echo "  $bin/nutch generate ${generate_args[@]}"
    echo "Failed with exit value $RETCODE."
    exit $RETCODE
  fi

  # capture the name of the segment just generated:
  # use hadoop fs -ls in distributed mode, plain ls in local mode
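  # (segment names are timestamps such as 20240101123456, so the newest one sorts last)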

  if [ $mode = "local" ]; then
   SEGMENT=`ls "$CRAWL_PATH"/segments/ | sort -n | tail -n 1`
  else
   SEGMENT=`hadoop fs -ls "$CRAWL_PATH"/segments/ | grep segments |  sed -e "s/\//\\n/g" | egrep 20[0-9]+ | sort -n | tail -n 1`
  fi

  echo "Operating on segment : $SEGMENT"

  # fetching the segment
  echo "Fetching : $SEGMENT"
  __bin_nutch fetch "${commonOptions[@]}" -D fetcher.timelimit.mins=$TIME_LIMIT_FETCH "$CRAWL_PATH"/segments/$SEGMENT -threads $NUM_THREADS

  # parsing the segment
  echo "Parsing : $SEGMENT"
  # enable record skipping during parsing so that a dodgy document
  # does not fail the whole task
  skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
  __bin_nutch parse "${commonOptions[@]}" $skipRecordsOptions "$CRAWL_PATH"/segments/$SEGMENT

  # updatedb with this segment
  echo "CrawlDB update"
  __bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb  "$CRAWL_PATH"/segments/$SEGMENT

  echo "HostDB update"
  if $HOSTDBUPDATE; then
  __update_hostdb
  fi

  # Note that all steps below in this loop (link inversion, deduplication, indexing)
  # can be done
  # - either inside the loop on a per segment basis
  # - or after the loop over all segments created in all loop iterations
  #   (both invertlinks and index accept multiple segments as input)
  # The latter is more efficient but the index is then updated later
  # (a commented-out sketch of this variant follows after the loop).
  echo "Link inversion"
  __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter

  echo "Dedup on crawldb"
  __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"

  if $INDEXFLAG; then
      echo "Indexing $SEGMENT to index"
      __bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -deleteGone
  else
      echo "Skipping indexing ..."
  fi

done
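
# As noted in the loop above, link inversion and indexing can alternatively be run
# once over all segments after the loop. A hedged sketch (flags may vary by Nutch
# version; check "bin/nutch invertlinks" and "bin/nutch index" usage before enabling):
#__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb -dir "$CRAWL_PATH"/segments -noNormalize -nofilter
#__bin_nutch index "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -linkdb "$CRAWL_PATH"/linkdb -dir "$CRAWL_PATH"/segments -deleteGone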

#######################################################
# The following commands fall into WebGraph territory
# and should be uncommented based on your requirements
#######################################################
#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"

#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"

#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and  WebGraph within $CRAWL_PATH"
#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"

#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores

exit 0
