public class StagedOutputJob
extends org.apache.hadoop.mapreduce.Job
implements java.util.concurrent.Callable<java.lang.Boolean>
Job
that stages its output in another location and only
moves it to the final destination if the job completes successfully.
It also outputs a counters file to the file system that contains counters fetched from Hadoop
and other task statistics.COMPLETION_POLL_INTERVAL_KEY, DEFAULT_SUBMIT_REPLICATION, OUTPUT_FILTER, PROGRESS_MONITOR_POLL_INTERVAL_KEY, SUBMIT_REPLICATION, USED_GENERIC_PARSER
APPLICATION_ATTEMPT_ID, APPLICATION_MASTER_CLASS, CACHE_ARCHIVES, CACHE_ARCHIVES_SIZES, CACHE_ARCHIVES_TIMESTAMPS, CACHE_ARCHIVES_VISIBILITIES, CACHE_FILE_TIMESTAMPS, CACHE_FILE_VISIBILITIES, CACHE_FILES, CACHE_FILES_SIZES, CACHE_LOCALARCHIVES, CACHE_LOCALFILES, CACHE_SYMLINK, CLASSPATH_ARCHIVES, CLASSPATH_FILES, COMBINE_CLASS_ATTR, COMBINE_RECORDS_BEFORE_PROGRESS, COMBINER_GROUP_COMPARATOR_CLASS, COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, COUNTER_GROUP_NAME_MAX_DEFAULT, COUNTER_GROUP_NAME_MAX_KEY, COUNTER_GROUPS_MAX_DEFAULT, COUNTER_GROUPS_MAX_KEY, COUNTER_NAME_MAX_DEFAULT, COUNTER_NAME_MAX_KEY, COUNTERS_MAX_DEFAULT, COUNTERS_MAX_KEY, DEFAULT_JOB_ACL_MODIFY_JOB, DEFAULT_JOB_ACL_VIEW_JOB, DEFAULT_JOB_AM_ACCESS_DISABLED, DEFAULT_JOB_RUNNING_MAP_LIMIT, DEFAULT_JOB_RUNNING_REDUCE_LIMIT, DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED, DEFAULT_LOG_LEVEL, DEFAULT_MAP_CPU_VCORES, DEFAULT_MAP_MEMORY_MB, DEFAULT_MAPRED_ADMIN_JAVA_OPTS, DEFAULT_MAPRED_ADMIN_USER_ENV, DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH, DEFAULT_MAPREDUCE_CROSS_PLATFORM_APPLICATION_CLASSPATH, DEFAULT_MAPREDUCE_JOB_EMIT_TIMELINE_DATA, DEFAULT_MAX_ALLOWED_FETCH_FAILURES_FRACTION, DEFAULT_MAX_FETCH_FAILURES_NOTIFICATIONS, DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES, DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY, DEFAULT_MR_AM_ADMIN_COMMAND_OPTS, DEFAULT_MR_AM_COMMAND_OPTS, DEFAULT_MR_AM_COMMIT_WINDOW_MS, DEFAULT_MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, DEFAULT_MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT, DEFAULT_MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE, DEFAULT_MR_AM_CPU_VCORES, DEFAULT_MR_AM_HARD_KILL_TIMEOUT_MS, DEFAULT_MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS, DEFAULT_MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, DEFAULT_MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, DEFAULT_MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, DEFAULT_MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERCENT, DEFAULT_MR_AM_JOB_CLIENT_THREAD_COUNT, DEFAULT_MR_AM_JOB_REDUCE_PREEMPTION_LIMIT, DEFAULT_MR_AM_JOB_REDUCE_RAMP_UP_LIMIT, DEFAULT_MR_AM_LOG_BACKUPS, DEFAULT_MR_AM_LOG_KB, DEFAULT_MR_AM_LOG_LEVEL, DEFAULT_MR_AM_MAX_ATTEMPTS, DEFAULT_MR_AM_NUM_PROGRESS_SPLITS, DEFAULT_MR_AM_PROFILE, DEFAULT_MR_AM_STAGING_DIR, DEFAULT_MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS, DEFAULT_MR_AM_TASK_LISTENER_THREAD_COUNT, DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS, DEFAULT_MR_AM_TO_RM_WAIT_INTERVAL_MS, DEFAULT_MR_AM_VMEM_MB, DEFAULT_MR_CLIENT_MAX_RETRIES, DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES, DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS, DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA, DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_BUFFER_KB, DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS, DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT, DEFAULT_MR_JOB_REDUCER_PREEMPT_DELAY_SEC, DEFAULT_REDUCE_CPU_VCORES, DEFAULT_REDUCE_MEMORY_MB, DEFAULT_REDUCE_SEPARATE_SHUFFLE_LOG, DEFAULT_SHELL, DEFAULT_SHUFFLE_FETCH_RETRY_INTERVAL_MS, DEFAULT_SHUFFLE_INPUT_BUFFER_PERCENT, DEFAULT_SHUFFLE_LOG_BACKUPS, DEFAULT_SHUFFLE_LOG_KB, DEFAULT_SPECULATIVE_MINIMUM_ALLOWED_TASKS, DEFAULT_SPECULATIVE_RETRY_AFTER_NO_SPECULATE, DEFAULT_SPECULATIVE_RETRY_AFTER_SPECULATE, DEFAULT_SPECULATIVECAP_RUNNING_TASKS, DEFAULT_SPECULATIVECAP_TOTAL_TASKS, DEFAULT_SPLIT_METAINFO_MAXSIZE, DEFAULT_TASK_ISMAP, DEFAULT_TASK_LOG_BACKUPS, DEFAULT_TASK_PROFILE_PARAMS, GROUP_COMPARATOR_CLASS, HADOOP_WORK_DIR, ID, INDEX_CACHE_MEMORY_LIMIT, INPUT_FORMAT_CLASS_ATTR, IO_SORT_FACTOR, IO_SORT_MB, JAR, JAR_UNPACK_PATTERN, JOB_ACL_MODIFY_JOB, JOB_ACL_VIEW_JOB, JOB_AM_ACCESS_DISABLED, JOB_CANCEL_DELEGATION_TOKEN, JOB_CONF_FILE, JOB_JAR, JOB_JOBTRACKER_ID, JOB_LOCAL_DIR, JOB_NAME, JOB_NAMENODES, JOB_RUNNING_MAP_LIMIT, JOB_RUNNING_REDUCE_LIMIT, JOB_SPLIT, JOB_SPLIT_METAINFO, JOB_SUBMIT_DIR, JOB_SUBMITHOST, JOB_SUBMITHOSTADDR, JOB_TAGS, JOB_TOKEN_TRACKING_IDS, JOB_TOKEN_TRACKING_IDS_ENABLED, JOB_UBERTASK_ENABLE, JOB_UBERTASK_MAXBYTES, JOB_UBERTASK_MAXMAPS, JOB_UBERTASK_MAXREDUCES, JVM_NUMTASKS_TORUN, KEY_COMPARATOR, MAP_CLASS_ATTR, MAP_COMBINE_MIN_SPILLS, MAP_CPU_VCORES, MAP_DEBUG_SCRIPT, MAP_ENV, MAP_FAILURES_MAX_PERCENT, MAP_INPUT_FILE, MAP_INPUT_PATH, MAP_INPUT_START, MAP_JAVA_OPTS, MAP_LOG_LEVEL, MAP_MAX_ATTEMPTS, MAP_MEMORY_MB, MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MAP_OUTPUT_COMPRESS, MAP_OUTPUT_COMPRESS_CODEC, MAP_OUTPUT_KEY_CLASS, MAP_OUTPUT_KEY_FIELD_SEPERATOR, MAP_OUTPUT_VALUE_CLASS, MAP_SKIP_INCR_PROC_COUNT, MAP_SKIP_MAX_RECORDS, MAP_SORT_SPILL_PERCENT, MAP_SPECULATIVE, MAPRED_ADMIN_USER_ENV, MAPRED_ADMIN_USER_SHELL, MAPRED_MAP_ADMIN_JAVA_OPTS, MAPRED_REDUCE_ADMIN_JAVA_OPTS, MAPREDUCE_APPLICATION_CLASSPATH, MAPREDUCE_APPLICATION_FRAMEWORK_PATH, MAPREDUCE_JOB_CLASSLOADER, MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES, MAPREDUCE_JOB_CREDENTIALS_BINARY, MAPREDUCE_JOB_DIR, MAPREDUCE_JOB_EMIT_TIMELINE_DATA, MAPREDUCE_JOB_LOG4J_PROPERTIES_FILE, MAPREDUCE_JOB_SHUFFLE_PROVIDER_SERVICES, MAPREDUCE_JOB_USER_CLASSPATH_FIRST, MAPREDUCE_V2_CHILD_CLASS, MAX_ALLOWED_FETCH_FAILURES_FRACTION, MAX_FETCH_FAILURES_NOTIFICATIONS, MAX_SHUFFLE_FETCH_HOST_FAILURES, MAX_SHUFFLE_FETCH_RETRY_DELAY, MAX_TASK_FAILURES_PER_TRACKER, MR_AM_ADMIN_COMMAND_OPTS, MR_AM_ADMIN_USER_ENV, MR_AM_COMMAND_OPTS, MR_AM_COMMIT_WINDOW_MS, MR_AM_COMMITTER_CANCEL_TIMEOUT_MS, MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT, MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE, MR_AM_CPU_VCORES, MR_AM_CREATE_JH_INTERMEDIATE_BASE_DIR, MR_AM_ENV, MR_AM_HARD_KILL_TIMEOUT_MS, MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS, MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER, MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS, MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD, MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERECENT, MR_AM_JOB_CLIENT_PORT_RANGE, MR_AM_JOB_CLIENT_THREAD_COUNT, MR_AM_JOB_NODE_BLACKLISTING_ENABLE, MR_AM_JOB_RECOVERY_ENABLE, MR_AM_JOB_RECOVERY_ENABLE_DEFAULT, MR_AM_JOB_REDUCE_PREEMPTION_LIMIT, MR_AM_JOB_REDUCE_RAMPUP_UP_LIMIT, MR_AM_JOB_SPECULATOR, MR_AM_LOG_BACKUPS, MR_AM_LOG_KB, MR_AM_LOG_LEVEL, MR_AM_MAX_ATTEMPTS, MR_AM_NUM_PROGRESS_SPLITS, MR_AM_PREFIX, MR_AM_PROFILE, MR_AM_PROFILE_PARAMS, MR_AM_SECURITY_SERVICE_AUTHORIZATION_CLIENT, MR_AM_SECURITY_SERVICE_AUTHORIZATION_TASK_UMBILICAL, MR_AM_STAGING_DIR, MR_AM_TASK_ESTIMATOR, MR_AM_TASK_ESTIMATOR_EXPONENTIAL_RATE_ENABLE, MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS, MR_AM_TASK_LISTENER_THREAD_COUNT, MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS, MR_AM_TO_RM_WAIT_INTERVAL_MS, MR_AM_VMEM_MB, MR_APPLICATION_TYPE, MR_CLIENT_MAX_RETRIES, MR_CLIENT_TO_AM_IPC_MAX_RETRIES, MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS, MR_ENCRYPTED_INTERMEDIATE_DATA, MR_ENCRYPTED_INTERMEDIATE_DATA_BUFFER_KB, MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS, MR_JOB_END_NOTIFICATION_MAX_ATTEMPTS, MR_JOB_END_NOTIFICATION_MAX_RETRY_INTERVAL, MR_JOB_END_NOTIFICATION_PROXY, MR_JOB_END_NOTIFICATION_TIMEOUT, MR_JOB_END_NOTIFICATION_URL, MR_JOB_END_RETRY_ATTEMPTS, MR_JOB_END_RETRY_INTERVAL, MR_JOB_REDUCER_PREEMPT_DELAY_SEC, MR_PREFIX, NUM_MAP_PROFILES, NUM_MAPS, NUM_REDUCE_PROFILES, NUM_REDUCES, OUTPUT, OUTPUT_FORMAT_CLASS_ATTR, OUTPUT_KEY_CLASS, OUTPUT_VALUE_CLASS, PARTITIONER_CLASS_ATTR, PRESERVE_FAILED_TASK_FILES, PRESERVE_FILES_PATTERN, PRIORITY, QUEUE_NAME, RECORDS_BEFORE_PROGRESS, REDUCE_CLASS_ATTR, REDUCE_CPU_VCORES, REDUCE_DEBUG_SCRIPT, REDUCE_ENV, REDUCE_FAILURES_MAXPERCENT, REDUCE_INPUT_BUFFER_PERCENT, REDUCE_JAVA_OPTS, REDUCE_LOG_LEVEL, REDUCE_MARKRESET_BUFFER_PERCENT, REDUCE_MARKRESET_BUFFER_SIZE, REDUCE_MAX_ATTEMPTS, REDUCE_MEMORY_MB, REDUCE_MEMORY_TOTAL_BYTES, REDUCE_MEMTOMEM_ENABLED, REDUCE_MEMTOMEM_THRESHOLD, REDUCE_MERGE_INMEM_THRESHOLD, REDUCE_SEPARATE_SHUFFLE_LOG, REDUCE_SKIP_INCR_PROC_COUNT, REDUCE_SKIP_MAXGROUPS, REDUCE_SPECULATIVE, RESERVATION_ID, SETUP_CLEANUP_NEEDED, SHUFFLE_CONNECT_TIMEOUT, SHUFFLE_FETCH_FAILURES, SHUFFLE_FETCH_RETRY_ENABLED, SHUFFLE_FETCH_RETRY_INTERVAL_MS, SHUFFLE_FETCH_RETRY_TIMEOUT_MS, SHUFFLE_INPUT_BUFFER_PERCENT, SHUFFLE_LOG_BACKUPS, SHUFFLE_LOG_KB, SHUFFLE_MEMORY_LIMIT_PERCENT, SHUFFLE_MERGE_PERCENT, SHUFFLE_NOTIFY_READERROR, SHUFFLE_PARALLEL_COPIES, SHUFFLE_READ_TIMEOUT, SKIP_OUTDIR, SKIP_RECORDS, SKIP_START_ATTEMPTS, SPECULATIVE_MINIMUM_ALLOWED_TASKS, SPECULATIVE_RETRY_AFTER_NO_SPECULATE, SPECULATIVE_RETRY_AFTER_SPECULATE, SPECULATIVE_SLOWNODE_THRESHOLD, SPECULATIVE_SLOWTASK_THRESHOLD, SPECULATIVECAP, SPECULATIVECAP_RUNNING_TASKS, SPECULATIVECAP_TOTAL_TASKS, SPLIT_FILE, SPLIT_METAINFO_MAXSIZE, STDERR_LOGFILE_ENV, STDOUT_LOGFILE_ENV, TASK_ATTEMPT_ID, TASK_CLEANUP_NEEDED, TASK_DEBUGOUT_LINES, TASK_ID, TASK_ISMAP, TASK_LOG_BACKUPS, TASK_MAP_PROFILE_PARAMS, TASK_OUTPUT_DIR, TASK_PARTITION, TASK_PROFILE, TASK_PROFILE_PARAMS, TASK_REDUCE_PROFILE_PARAMS, TASK_TIMEOUT, TASK_TIMEOUT_CHECK_INTERVAL_MS, TASK_USERLOG_LIMIT, USER_LOG_RETAIN_HOURS, USER_NAME, WORKDIR, WORKFLOW_ADJACENCY_PREFIX_PATTERN, WORKFLOW_ADJACENCY_PREFIX_STRING, WORKFLOW_ID, WORKFLOW_NAME, WORKFLOW_NODE_NAME, WORKFLOW_TAGS, WORKING_DIR
Constructor and Description |
---|
StagedOutputJob(org.apache.hadoop.conf.Configuration conf,
java.lang.String stagingPrefix,
org.apache.log4j.Logger log)
Initializes the job.
|
Modifier and Type | Method and Description |
---|---|
java.lang.Boolean |
call()
Run the job.
|
static StagedOutputJob |
createStagedJob(org.apache.hadoop.conf.Configuration conf,
java.lang.String jobName,
java.util.List<java.lang.String> inputPaths,
java.lang.String stagingLocation,
java.lang.String outputPath,
org.apache.log4j.Logger log)
Creates a job which using a temporary staging location for the output data.
|
org.apache.hadoop.fs.Path |
getCountersParentPath()
Gets path to store the counters.
|
org.apache.hadoop.fs.Path |
getCountersPath()
Path to written counters.
|
boolean |
getWriteCounters()
Get whether counters should be written.
|
void |
setCountersParentPath(org.apache.hadoop.fs.Path path)
Sets path to store the counters.
|
void |
setWriteCounters(boolean writeCounters)
Sets whether counters should be written.
|
boolean |
waitForCompletion(boolean verbose)
Run the job and wait for it to complete.
|
addArchiveToClassPath, addCacheArchive, addCacheFile, addFileToClassPath, cleanupProgress, createSymlink, failTask, getCluster, getCompletionPollInterval, getCounters, getFinishTime, getHistoryUrl, getInstance, getInstance, getInstance, getInstance, getInstance, getInstance, getInstance, getJobFile, getJobName, getJobState, getJobSubmitter, getPriority, getProgressPollInterval, getReservationId, getSchedulingInfo, getStartTime, getStatus, getTaskCompletionEvents, getTaskCompletionEvents, getTaskDiagnostics, getTaskOutputFilter, getTaskReports, getTrackingURL, isComplete, isRetired, isSuccessful, isUber, killJob, killTask, killTask, mapProgress, monitorAndPrintJob, reduceProgress, setCacheArchives, setCacheFiles, setCancelDelegationTokenUponJobCompletion, setCombinerClass, setCombinerKeyGroupingComparatorClass, setGroupingComparatorClass, setInputFormatClass, setJar, setJarByClass, setJobName, setJobSetupCleanupNeeded, setMapOutputKeyClass, setMapOutputValueClass, setMapperClass, setMapSpeculativeExecution, setMaxMapAttempts, setMaxReduceAttempts, setNumReduceTasks, setOutputFormatClass, setOutputKeyClass, setOutputValueClass, setPartitionerClass, setPriority, setProfileEnabled, setProfileParams, setProfileTaskRange, setReducerClass, setReduceSpeculativeExecution, setReservationId, setSortComparatorClass, setSpeculativeExecution, setTaskOutputFilter, setupProgress, setUser, setWorkingDirectory, submit, toString
getArchiveClassPaths, getArchiveTimestamps, getCacheArchives, getCacheFiles, getCombinerClass, getCombinerKeyGroupingComparator, getConfiguration, getCredentials, getFileClassPaths, getFileTimestamps, getGroupingComparator, getInputFormatClass, getJar, getJobID, getJobSetupCleanupNeeded, getLocalCacheArchives, getLocalCacheFiles, getMapOutputKeyClass, getMapOutputValueClass, getMapperClass, getMaxMapAttempts, getMaxReduceAttempts, getNumReduceTasks, getOutputFormatClass, getOutputKeyClass, getOutputValueClass, getPartitionerClass, getProfileEnabled, getProfileParams, getProfileTaskRange, getReducerClass, getSortComparator, getSymlink, getTaskCleanupNeeded, getUser, getWorkingDirectory, setJobID
clone, equals, finalize, getClass, hashCode, notify, notifyAll, wait, wait, wait
getArchiveClassPaths, getArchiveTimestamps, getCacheArchives, getCacheFiles, getCombinerClass, getCombinerKeyGroupingComparator, getConfiguration, getCredentials, getFileClassPaths, getFileTimestamps, getGroupingComparator, getInputFormatClass, getJar, getJobID, getJobSetupCleanupNeeded, getLocalCacheArchives, getLocalCacheFiles, getMapOutputKeyClass, getMapOutputValueClass, getMapperClass, getMaxMapAttempts, getMaxReduceAttempts, getNumReduceTasks, getOutputFormatClass, getOutputKeyClass, getOutputValueClass, getPartitionerClass, getProfileEnabled, getProfileParams, getProfileTaskRange, getReducerClass, getSortComparator, getSymlink, getTaskCleanupNeeded, getUser, getWorkingDirectory
public StagedOutputJob(org.apache.hadoop.conf.Configuration conf, java.lang.String stagingPrefix, org.apache.log4j.Logger log) throws java.io.IOException
conf
- configurationstagingPrefix
- where to stage output temporarilylog
- loggerjava.io.IOException
- IOExceptionpublic static StagedOutputJob createStagedJob(org.apache.hadoop.conf.Configuration conf, java.lang.String jobName, java.util.List<java.lang.String> inputPaths, java.lang.String stagingLocation, java.lang.String outputPath, org.apache.log4j.Logger log)
conf
- configurationjobName
- job nameinputPaths
- input pathsstagingLocation
- where to stage output temporarilyoutputPath
- output pathlog
- loggerpublic org.apache.hadoop.fs.Path getCountersParentPath()
public void setCountersParentPath(org.apache.hadoop.fs.Path path)
path
- parent path for counterspublic org.apache.hadoop.fs.Path getCountersPath()
public boolean getWriteCounters()
public void setWriteCounters(boolean writeCounters)
writeCounters
- true if counters should be writtenpublic java.lang.Boolean call() throws java.lang.Exception
call
in interface java.util.concurrent.Callable<java.lang.Boolean>
java.lang.Exception
public boolean waitForCompletion(boolean verbose) throws java.io.IOException, java.lang.InterruptedException, java.lang.ClassNotFoundException
waitForCompletion
in class org.apache.hadoop.mapreduce.Job
java.io.IOException
java.lang.InterruptedException
java.lang.ClassNotFoundException