Hadoop (4): Mini-Project Practice (work in progress)

 

1. Hadoop Java API

The Mapper class, which you implement (typically as a static nested class inside the job class):

@Public
@Stable
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Mapper() {
    }

    protected void setup(Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
    }

    protected void map(KEYIN key, VALUEIN value, Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }

    protected void cleanup(Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
    }

    public void run(Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
        this.setup(context);

        try {
            while(context.nextKeyValue()) {
                this.map(context.getCurrentKey(), context.getCurrentValue(), context);
            }
        } finally {
            this.cleanup(context);
        }

    }

    public abstract class Context implements MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
        public Context() {
        }
    }
}
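In practice you never call run() yourself; the framework drives it. You extend Mapper, override map(), and override setup()/cleanup() only when you need per-task initialization or teardown. A minimal sketch follows; the class name and the demo.token.separator configuration key are hypothetical, not part of the Hadoop API.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: setup() reads a separator from the job configuration,
// map() emits (token, 1) for every token and counts empty lines with a custom counter.
public class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);
    private final Text outKey = new Text();
    private String separator;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // runs once per map task, before the first map() call
        separator = context.getConfiguration().get("demo.token.separator", "\t"); // hypothetical key
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        if (line.isEmpty()) {
            context.getCounter("demo", "empty-lines").increment(1); // counter API from TaskAttemptContext
            return;
        }
        for (String token : line.split(separator)) {
            outKey.set(token);
            context.write(outKey, ONE); // write() comes from TaskInputOutputContext
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // runs once per map task, after the last map() call
    }
}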

The interface behind Mapper.Context:

@Public
@Evolving
public interface MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    InputSplit getInputSplit();
}

which extends:

@Public
@Evolving
public interface TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends TaskAttemptContext {
    boolean nextKeyValue() throws IOException, InterruptedException;

    KEYIN getCurrentKey() throws IOException, InterruptedException;

    VALUEIN getCurrentValue() throws IOException, InterruptedException;

    void write(KEYOUT var1, VALUEOUT var2) throws IOException, InterruptedException;

    OutputCommitter getOutputCommitter();
}

which extends:

@Public
@Evolving
public interface TaskAttemptContext extends JobContext, Progressable {
    TaskAttemptID getTaskAttemptID();

    void setStatus(String var1);

    String getStatus();

    float getProgress();

    Counter getCounter(Enum<?> var1);

    Counter getCounter(String var1, String var2);
}

which extends:

@Public
@Evolving
public interface JobContext extends MRJobConfig {
    Configuration getConfiguration();

    Credentials getCredentials();

    JobID getJobID();

    int getNumReduceTasks();

    Path getWorkingDirectory() throws IOException;

    Class<?> getOutputKeyClass();

    Class<?> getOutputValueClass();

    Class<?> getMapOutputKeyClass();

    Class<?> getMapOutputValueClass();

    String getJobName();

    Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException;

    Class<? extends Mapper<?, ?, ?, ?>> getMapperClass() throws ClassNotFoundException;

    Class<? extends Reducer<?, ?, ?, ?>> getCombinerClass() throws ClassNotFoundException;

    Class<? extends Reducer<?, ?, ?, ?>> getReducerClass() throws ClassNotFoundException;

    Class<? extends OutputFormat<?, ?>> getOutputFormatClass() throws ClassNotFoundException;

    Class<? extends Partitioner<?, ?>> getPartitionerClass() throws ClassNotFoundException;

    RawComparator<?> getSortComparator();

    String getJar();

    RawComparator<?> getCombinerKeyGroupingComparator();

    RawComparator<?> getGroupingComparator();

    boolean getJobSetupCleanupNeeded();

    boolean getTaskCleanupNeeded();

    boolean getProfileEnabled();

    String getProfileParams();

    IntegerRanges getProfileTaskRange(boolean var1);

    String getUser();

    /** @deprecated */
    @Deprecated
    boolean getSymlink();

    Path[] getArchiveClassPaths();

    URI[] getCacheArchives() throws IOException;

    URI[] getCacheFiles() throws IOException;

    /** @deprecated */
    @Deprecated
    Path[] getLocalCacheArchives() throws IOException;

    /** @deprecated */
    @Deprecated
    Path[] getLocalCacheFiles() throws IOException;

    Path[] getFileClassPaths();

    String[] getArchiveTimestamps();

    String[] getFileTimestamps();

    int getMaxMapAttempts();

    int getMaxReduceAttempts();
}
JobContext in turn extends MRJobConfig, which defines the job configuration key constants:
@Private
@Evolving
public interface MRJobConfig {
    String MAP_SORT_CLASS = "map.sort.class";
    String INPUT_FORMAT_CLASS_ATTR = "mapreduce.job.inputformat.class";
    String MAP_CLASS_ATTR = "mapreduce.job.map.class";
    String MAP_OUTPUT_COLLECTOR_CLASS_ATTR = "mapreduce.job.map.output.collector.class";
    String COMBINE_CLASS_ATTR = "mapreduce.job.combine.class";
    String REDUCE_CLASS_ATTR = "mapreduce.job.reduce.class";
    String OUTPUT_FORMAT_CLASS_ATTR = "mapreduce.job.outputformat.class";
    String PARTITIONER_CLASS_ATTR = "mapreduce.job.partitioner.class";
    String SETUP_CLEANUP_NEEDED = "mapreduce.job.committer.setup.cleanup.needed";
    String TASK_CLEANUP_NEEDED = "mapreduce.job.committer.task.cleanup.needed";
    String TASK_LOCAL_WRITE_LIMIT_BYTES = "mapreduce.task.local-fs.write-limit.bytes";
    long DEFAULT_TASK_LOCAL_WRITE_LIMIT_BYTES = -1L;
    String JAR = "mapreduce.job.jar";
    String ID = "mapreduce.job.id";
    String JOB_NAME = "mapreduce.job.name";
    String JAR_UNPACK_PATTERN = "mapreduce.job.jar.unpack.pattern";
    String USER_NAME = "mapreduce.job.user.name";
    String PRIORITY = "mapreduce.job.priority";
    String QUEUE_NAME = "mapreduce.job.queuename";
    String JOB_NODE_LABEL_EXP = "mapreduce.job.node-label-expression";
    String AM_NODE_LABEL_EXP = "mapreduce.job.am.node-label-expression";
    String MAP_NODE_LABEL_EXP = "mapreduce.map.node-label-expression";
    String REDUCE_NODE_LABEL_EXP = "mapreduce.reduce.node-label-expression";
    String AM_STRICT_LOCALITY = "mapreduce.job.am.strict-locality";
    String RESERVATION_ID = "mapreduce.job.reservation.id";
    String JOB_TAGS = "mapreduce.job.tags";
    String JVM_NUMTASKS_TORUN = "mapreduce.job.jvm.numtasks";
    String SPLIT_FILE = "mapreduce.job.splitfile";
    String SPLIT_METAINFO_MAXSIZE = "mapreduce.job.split.metainfo.maxsize";
    long DEFAULT_SPLIT_METAINFO_MAXSIZE = 10000000L;
    String NUM_MAPS = "mapreduce.job.maps";
    String MAX_TASK_FAILURES_PER_TRACKER = "mapreduce.job.maxtaskfailures.per.tracker";
    String COMPLETED_MAPS_FOR_REDUCE_SLOWSTART = "mapreduce.job.reduce.slowstart.completedmaps";
    String NUM_REDUCES = "mapreduce.job.reduces";
    String SKIP_RECORDS = "mapreduce.job.skiprecords";
    String SKIP_OUTDIR = "mapreduce.job.skip.outdir";
    /** @deprecated */
    @Deprecated
    String SPECULATIVE_SLOWNODE_THRESHOLD = "mapreduce.job.speculative.slownodethreshold";
    String SPECULATIVE_SLOWTASK_THRESHOLD = "mapreduce.job.speculative.slowtaskthreshold";
    /** @deprecated */
    @Deprecated
    String SPECULATIVECAP = "mapreduce.job.speculative.speculativecap";
    String SPECULATIVECAP_RUNNING_TASKS = "mapreduce.job.speculative.speculative-cap-running-tasks";
    double DEFAULT_SPECULATIVECAP_RUNNING_TASKS = 0.1D;
    String SPECULATIVECAP_TOTAL_TASKS = "mapreduce.job.speculative.speculative-cap-total-tasks";
    double DEFAULT_SPECULATIVECAP_TOTAL_TASKS = 0.01D;
    String SPECULATIVE_MINIMUM_ALLOWED_TASKS = "mapreduce.job.speculative.minimum-allowed-tasks";
    int DEFAULT_SPECULATIVE_MINIMUM_ALLOWED_TASKS = 10;
    String SPECULATIVE_RETRY_AFTER_NO_SPECULATE = "mapreduce.job.speculative.retry-after-no-speculate";
    long DEFAULT_SPECULATIVE_RETRY_AFTER_NO_SPECULATE = 1000L;
    String SPECULATIVE_RETRY_AFTER_SPECULATE = "mapreduce.job.speculative.retry-after-speculate";
    long DEFAULT_SPECULATIVE_RETRY_AFTER_SPECULATE = 15000L;
    String JOB_LOCAL_DIR = "mapreduce.job.local.dir";
    String OUTPUT_KEY_CLASS = "mapreduce.job.output.key.class";
    String OUTPUT_VALUE_CLASS = "mapreduce.job.output.value.class";
    String KEY_COMPARATOR = "mapreduce.job.output.key.comparator.class";
    String COMBINER_GROUP_COMPARATOR_CLASS = "mapreduce.job.combiner.group.comparator.class";
    String GROUP_COMPARATOR_CLASS = "mapreduce.job.output.group.comparator.class";
    String WORKING_DIR = "mapreduce.job.working.dir";
    String CLASSPATH_ARCHIVES = "mapreduce.job.classpath.archives";
    String CLASSPATH_FILES = "mapreduce.job.classpath.files";
    String CACHE_FILES = "mapreduce.job.cache.files";
    String CACHE_ARCHIVES = "mapreduce.job.cache.archives";
    String CACHE_FILES_SIZES = "mapreduce.job.cache.files.filesizes";
    String CACHE_ARCHIVES_SIZES = "mapreduce.job.cache.archives.filesizes";
    String CACHE_LOCALFILES = "mapreduce.job.cache.local.files";
    String CACHE_LOCALARCHIVES = "mapreduce.job.cache.local.archives";
    String CACHE_FILE_TIMESTAMPS = "mapreduce.job.cache.files.timestamps";
    String CACHE_ARCHIVES_TIMESTAMPS = "mapreduce.job.cache.archives.timestamps";
    String CACHE_FILE_VISIBILITIES = "mapreduce.job.cache.files.visibilities";
    String CACHE_ARCHIVES_VISIBILITIES = "mapreduce.job.cache.archives.visibilities";
    String JOBJAR_VISIBILITY = "mapreduce.job.jobjar.visibility";
    boolean JOBJAR_VISIBILITY_DEFAULT = false;
    String JOBJAR_SHARED_CACHE_UPLOAD_POLICY = "mapreduce.job.jobjar.sharedcache.uploadpolicy";
    boolean JOBJAR_SHARED_CACHE_UPLOAD_POLICY_DEFAULT = false;
    String CACHE_FILES_SHARED_CACHE_UPLOAD_POLICIES = "mapreduce.job.cache.files.sharedcache.uploadpolicies";
    String CACHE_ARCHIVES_SHARED_CACHE_UPLOAD_POLICIES = "mapreduce.job.cache.archives.sharedcache.uploadpolicies";
    String FILES_FOR_SHARED_CACHE = "mapreduce.job.cache.sharedcache.files";
    String FILES_FOR_CLASSPATH_AND_SHARED_CACHE = "mapreduce.job.cache.sharedcache.files.addtoclasspath";
    String ARCHIVES_FOR_SHARED_CACHE = "mapreduce.job.cache.sharedcache.archives";
    String SHARED_CACHE_MODE = "mapreduce.job.sharedcache.mode";
    String SHARED_CACHE_MODE_DEFAULT = "disabled";
    /** @deprecated */
    @Deprecated
    String CACHE_SYMLINK = "mapreduce.job.cache.symlink.create";
    String USER_LOG_RETAIN_HOURS = "mapreduce.job.userlog.retain.hours";
    String MAPREDUCE_JOB_USER_CLASSPATH_FIRST = "mapreduce.job.user.classpath.first";
    String MAPREDUCE_JOB_CLASSLOADER = "mapreduce.job.classloader";
    String MAPREDUCE_JOB_SHUFFLE_PROVIDER_SERVICES = "mapreduce.job.shuffle.provider.services";
    String MAPREDUCE_JOB_CLASSLOADER_SYSTEM_CLASSES = "mapreduce.job.classloader.system.classes";
    String MAPREDUCE_JVM_SYSTEM_PROPERTIES_TO_LOG = "mapreduce.jvm.system-properties-to-log";
    String DEFAULT_MAPREDUCE_JVM_SYSTEM_PROPERTIES_TO_LOG = "os.name,os.version,java.home,java.runtime.version,java.vendor,java.version,java.vm.name,java.class.path,java.io.tmpdir,user.dir,user.name";
    String IO_SORT_FACTOR = "mapreduce.task.io.sort.factor";
    int DEFAULT_IO_SORT_FACTOR = 10;
    String IO_SORT_MB = "mapreduce.task.io.sort.mb";
    int DEFAULT_IO_SORT_MB = 100;
    String INDEX_CACHE_MEMORY_LIMIT = "mapreduce.task.index.cache.limit.bytes";
    String PRESERVE_FAILED_TASK_FILES = "mapreduce.task.files.preserve.failedtasks";
    String PRESERVE_FILES_PATTERN = "mapreduce.task.files.preserve.filepattern";
    String TASK_DEBUGOUT_LINES = "mapreduce.task.debugout.lines";
    String RECORDS_BEFORE_PROGRESS = "mapreduce.task.merge.progress.records";
    String SKIP_START_ATTEMPTS = "mapreduce.task.skip.start.attempts";
    String TASK_ATTEMPT_ID = "mapreduce.task.attempt.id";
    String TASK_ISMAP = "mapreduce.task.ismap";
    boolean DEFAULT_TASK_ISMAP = true;
    String TASK_PARTITION = "mapreduce.task.partition";
    String TASK_PROFILE = "mapreduce.task.profile";
    String TASK_PROFILE_PARAMS = "mapreduce.task.profile.params";
    String DEFAULT_TASK_PROFILE_PARAMS = "-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s";
    String NUM_MAP_PROFILES = "mapreduce.task.profile.maps";
    String NUM_REDUCE_PROFILES = "mapreduce.task.profile.reduces";
    String TASK_MAP_PROFILE_PARAMS = "mapreduce.task.profile.map.params";
    String TASK_REDUCE_PROFILE_PARAMS = "mapreduce.task.profile.reduce.params";
    String TASK_TIMEOUT = "mapreduce.task.timeout";
    long DEFAULT_TASK_TIMEOUT_MILLIS = 300000L;
    String TASK_PROGRESS_REPORT_INTERVAL = "mapreduce.task.progress-report.interval";
    String TASK_TIMEOUT_CHECK_INTERVAL_MS = "mapreduce.task.timeout.check-interval-ms";
    String TASK_EXIT_TIMEOUT = "mapreduce.task.exit.timeout";
    int TASK_EXIT_TIMEOUT_DEFAULT = 60000;
    String TASK_EXIT_TIMEOUT_CHECK_INTERVAL_MS = "mapreduce.task.exit.timeout.check-interval-ms";
    int TASK_EXIT_TIMEOUT_CHECK_INTERVAL_MS_DEFAULT = 20000;
    String TASK_ID = "mapreduce.task.id";
    String TASK_OUTPUT_DIR = "mapreduce.task.output.dir";
    String TASK_USERLOG_LIMIT = "mapreduce.task.userlog.limit.kb";
    String MAP_SORT_SPILL_PERCENT = "mapreduce.map.sort.spill.percent";
    String MAP_INPUT_FILE = "mapreduce.map.input.file";
    String MAP_INPUT_PATH = "mapreduce.map.input.length";
    String MAP_INPUT_START = "mapreduce.map.input.start";
    String MAP_MEMORY_MB = "mapreduce.map.memory.mb";
    int DEFAULT_MAP_MEMORY_MB = 1024;
    String MAP_CPU_VCORES = "mapreduce.map.cpu.vcores";
    int DEFAULT_MAP_CPU_VCORES = 1;
    String MAP_RESOURCE_TYPE_PREFIX = "mapreduce.map.resource.";
    String RESOURCE_TYPE_NAME_VCORE = "vcores";
    String RESOURCE_TYPE_NAME_MEMORY = "memory";
    String RESOURCE_TYPE_ALTERNATIVE_NAME_MEMORY = "memory-mb";
    String MAP_ENV = "mapreduce.map.env";
    String MAP_JAVA_OPTS = "mapreduce.map.java.opts";
    String MAP_MAX_ATTEMPTS = "mapreduce.map.maxattempts";
    String MAP_DEBUG_SCRIPT = "mapreduce.map.debug.script";
    String MAP_SPECULATIVE = "mapreduce.map.speculative";
    String MAP_FAILURES_MAX_PERCENT = "mapreduce.map.failures.maxpercent";
    String MAP_SKIP_INCR_PROC_COUNT = "mapreduce.map.skip.proc-count.auto-incr";
    String MAP_SKIP_MAX_RECORDS = "mapreduce.map.skip.maxrecords";
    String MAP_COMBINE_MIN_SPILLS = "mapreduce.map.combine.minspills";
    String MAP_OUTPUT_COMPRESS = "mapreduce.map.output.compress";
    String MAP_OUTPUT_COMPRESS_CODEC = "mapreduce.map.output.compress.codec";
    String MAP_OUTPUT_KEY_CLASS = "mapreduce.map.output.key.class";
    String MAP_OUTPUT_VALUE_CLASS = "mapreduce.map.output.value.class";
    String MAP_OUTPUT_KEY_FIELD_SEPARATOR = "mapreduce.map.output.key.field.separator";
    /** @deprecated */
    @Deprecated
    String MAP_OUTPUT_KEY_FIELD_SEPERATOR = "mapreduce.map.output.key.field.separator";
    String MAP_LOG_LEVEL = "mapreduce.map.log.level";
    String REDUCE_LOG_LEVEL = "mapreduce.reduce.log.level";
    String DEFAULT_LOG_LEVEL = "INFO";
    String REDUCE_MERGE_INMEM_THRESHOLD = "mapreduce.reduce.merge.inmem.threshold";
    String REDUCE_INPUT_BUFFER_PERCENT = "mapreduce.reduce.input.buffer.percent";
    String REDUCE_MARKRESET_BUFFER_PERCENT = "mapreduce.reduce.markreset.buffer.percent";
    String REDUCE_MARKRESET_BUFFER_SIZE = "mapreduce.reduce.markreset.buffer.size";
    String REDUCE_MEMORY_MB = "mapreduce.reduce.memory.mb";
    int DEFAULT_REDUCE_MEMORY_MB = 1024;
    String REDUCE_CPU_VCORES = "mapreduce.reduce.cpu.vcores";
    int DEFAULT_REDUCE_CPU_VCORES = 1;
    String REDUCE_RESOURCE_TYPE_PREFIX = "mapreduce.reduce.resource.";
    String REDUCE_MEMORY_TOTAL_BYTES = "mapreduce.reduce.memory.totalbytes";
    String SHUFFLE_INPUT_BUFFER_PERCENT = "mapreduce.reduce.shuffle.input.buffer.percent";
    float DEFAULT_SHUFFLE_INPUT_BUFFER_PERCENT = 0.7F;
    String SHUFFLE_MEMORY_LIMIT_PERCENT = "mapreduce.reduce.shuffle.memory.limit.percent";
    String SHUFFLE_MERGE_PERCENT = "mapreduce.reduce.shuffle.merge.percent";
    float DEFAULT_SHUFFLE_MERGE_PERCENT = 0.66F;
    String REDUCE_FAILURES_MAXPERCENT = "mapreduce.reduce.failures.maxpercent";
    String REDUCE_ENV = "mapreduce.reduce.env";
    String REDUCE_JAVA_OPTS = "mapreduce.reduce.java.opts";
    String MAPREDUCE_JOB_DIR = "mapreduce.job.dir";
    String REDUCE_MAX_ATTEMPTS = "mapreduce.reduce.maxattempts";
    String SHUFFLE_PARALLEL_COPIES = "mapreduce.reduce.shuffle.parallelcopies";
    String REDUCE_DEBUG_SCRIPT = "mapreduce.reduce.debug.script";
    String REDUCE_SPECULATIVE = "mapreduce.reduce.speculative";
    String SHUFFLE_CONNECT_TIMEOUT = "mapreduce.reduce.shuffle.connect.timeout";
    String SHUFFLE_READ_TIMEOUT = "mapreduce.reduce.shuffle.read.timeout";
    String SHUFFLE_FETCH_FAILURES = "mapreduce.reduce.shuffle.maxfetchfailures";
    String MAX_ALLOWED_FETCH_FAILURES_FRACTION = "mapreduce.reduce.shuffle.max-fetch-failures-fraction";
    float DEFAULT_MAX_ALLOWED_FETCH_FAILURES_FRACTION = 0.5F;
    String MAX_FETCH_FAILURES_NOTIFICATIONS = "mapreduce.reduce.shuffle.max-fetch-failures-notifications";
    int DEFAULT_MAX_FETCH_FAILURES_NOTIFICATIONS = 3;
    String SHUFFLE_FETCH_RETRY_INTERVAL_MS = "mapreduce.reduce.shuffle.fetch.retry.interval-ms";
    int DEFAULT_SHUFFLE_FETCH_RETRY_INTERVAL_MS = 1000;
    String SHUFFLE_FETCH_RETRY_TIMEOUT_MS = "mapreduce.reduce.shuffle.fetch.retry.timeout-ms";
    String SHUFFLE_FETCH_RETRY_ENABLED = "mapreduce.reduce.shuffle.fetch.retry.enabled";
    String SHUFFLE_NOTIFY_READERROR = "mapreduce.reduce.shuffle.notify.readerror";
    String MAX_SHUFFLE_FETCH_RETRY_DELAY = "mapreduce.reduce.shuffle.retry-delay.max.ms";
    long DEFAULT_MAX_SHUFFLE_FETCH_RETRY_DELAY = 60000L;
    String MAX_SHUFFLE_FETCH_HOST_FAILURES = "mapreduce.reduce.shuffle.max-host-failures";
    int DEFAULT_MAX_SHUFFLE_FETCH_HOST_FAILURES = 5;
    String REDUCE_SKIP_INCR_PROC_COUNT = "mapreduce.reduce.skip.proc-count.auto-incr";
    String REDUCE_SKIP_MAXGROUPS = "mapreduce.reduce.skip.maxgroups";
    String REDUCE_MEMTOMEM_THRESHOLD = "mapreduce.reduce.merge.memtomem.threshold";
    String REDUCE_MEMTOMEM_ENABLED = "mapreduce.reduce.merge.memtomem.enabled";
    String COMBINE_RECORDS_BEFORE_PROGRESS = "mapreduce.task.combine.progress.records";
    String JOB_NAMENODES = "mapreduce.job.hdfs-servers";
    String JOB_NAMENODES_TOKEN_RENEWAL_EXCLUDE = "mapreduce.job.hdfs-servers.token-renewal.exclude";
    String JOB_JOBTRACKER_ID = "mapreduce.job.kerberos.jtprinicipal";
    String JOB_CANCEL_DELEGATION_TOKEN = "mapreduce.job.complete.cancel.delegation.tokens";
    String JOB_ACL_VIEW_JOB = "mapreduce.job.acl-view-job";
    String DEFAULT_JOB_ACL_VIEW_JOB = " ";
    String JOB_ACL_MODIFY_JOB = "mapreduce.job.acl-modify-job";
    String DEFAULT_JOB_ACL_MODIFY_JOB = " ";
    String JOB_RUNNING_MAP_LIMIT = "mapreduce.job.running.map.limit";
    int DEFAULT_JOB_RUNNING_MAP_LIMIT = 0;
    String JOB_RUNNING_REDUCE_LIMIT = "mapreduce.job.running.reduce.limit";
    int DEFAULT_JOB_RUNNING_REDUCE_LIMIT = 0;
    String JOB_MAX_MAP = "mapreduce.job.max.map";
    int DEFAULT_JOB_MAX_MAP = -1;
    String MAPREDUCE_JOB_CREDENTIALS_BINARY = "mapreduce.job.credentials.binary";
    String JOB_TOKEN_TRACKING_IDS_ENABLED = "mapreduce.job.token.tracking.ids.enabled";
    boolean DEFAULT_JOB_TOKEN_TRACKING_IDS_ENABLED = false;
    String JOB_TOKEN_TRACKING_IDS = "mapreduce.job.token.tracking.ids";
    String JOB_SUBMITHOST = "mapreduce.job.submithostname";
    String JOB_SUBMITHOSTADDR = "mapreduce.job.submithostaddress";
    String COUNTERS_MAX_KEY = "mapreduce.job.counters.max";
    int COUNTERS_MAX_DEFAULT = 120;
    String COUNTER_GROUP_NAME_MAX_KEY = "mapreduce.job.counters.group.name.max";
    int COUNTER_GROUP_NAME_MAX_DEFAULT = 128;
    String COUNTER_NAME_MAX_KEY = "mapreduce.job.counters.counter.name.max";
    int COUNTER_NAME_MAX_DEFAULT = 64;
    String COUNTER_GROUPS_MAX_KEY = "mapreduce.job.counters.groups.max";
    int COUNTER_GROUPS_MAX_DEFAULT = 50;
    String JOB_UBERTASK_ENABLE = "mapreduce.job.ubertask.enable";
    String JOB_UBERTASK_MAXMAPS = "mapreduce.job.ubertask.maxmaps";
    String JOB_UBERTASK_MAXREDUCES = "mapreduce.job.ubertask.maxreduces";
    String JOB_UBERTASK_MAXBYTES = "mapreduce.job.ubertask.maxbytes";
    String MAPREDUCE_JOB_EMIT_TIMELINE_DATA = "mapreduce.job.emit-timeline-data";
    boolean DEFAULT_MAPREDUCE_JOB_EMIT_TIMELINE_DATA = false;
    String MR_PREFIX = "yarn.app.mapreduce.";
    String MR_AM_PREFIX = "yarn.app.mapreduce.am.";
    String MR_CLIENT_TO_AM_IPC_MAX_RETRIES = "yarn.app.mapreduce.client-am.ipc.max-retries";
    int DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES = 3;
    String MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS = "yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts";
    int DEFAULT_MR_CLIENT_TO_AM_IPC_MAX_RETRIES_ON_TIMEOUTS = 3;
    String MR_CLIENT_MAX_RETRIES = "yarn.app.mapreduce.client.max-retries";
    int DEFAULT_MR_CLIENT_MAX_RETRIES = 3;
    String MR_CLIENT_JOB_MAX_RETRIES = "yarn.app.mapreduce.client.job.max-retries";
    int DEFAULT_MR_CLIENT_JOB_MAX_RETRIES = 3;
    String MR_CLIENT_JOB_RETRY_INTERVAL = "yarn.app.mapreduce.client.job.retry-interval";
    long DEFAULT_MR_CLIENT_JOB_RETRY_INTERVAL = 2000L;
    String MR_AM_STAGING_DIR = "yarn.app.mapreduce.am.staging-dir";
    String DEFAULT_MR_AM_STAGING_DIR = "/tmp/hadoop-yarn/staging";
    String MR_AM_VMEM_MB = "yarn.app.mapreduce.am.resource.mb";
    int DEFAULT_MR_AM_VMEM_MB = 1536;
    String MR_AM_CPU_VCORES = "yarn.app.mapreduce.am.resource.cpu-vcores";
    int DEFAULT_MR_AM_CPU_VCORES = 1;
    String MR_AM_RESOURCE_PREFIX = "yarn.app.mapreduce.am.resource.";
    String MR_AM_COMMAND_OPTS = "yarn.app.mapreduce.am.command-opts";
    String DEFAULT_MR_AM_COMMAND_OPTS = "-Xmx1024m";
    String MR_AM_ADMIN_COMMAND_OPTS = "yarn.app.mapreduce.am.admin-command-opts";
    String DEFAULT_MR_AM_ADMIN_COMMAND_OPTS = "";
    String MR_AM_LOG_LEVEL = "yarn.app.mapreduce.am.log.level";
    String DEFAULT_MR_AM_LOG_LEVEL = "INFO";
    String MR_AM_LOG_KB = "yarn.app.mapreduce.am.container.log.limit.kb";
    int DEFAULT_MR_AM_LOG_KB = 0;
    String MR_AM_LOG_BACKUPS = "yarn.app.mapreduce.am.container.log.backups";
    int DEFAULT_MR_AM_LOG_BACKUPS = 0;
    String MR_AM_NUM_PROGRESS_SPLITS = "yarn.app.mapreduce.am.num-progress-splits";
    int DEFAULT_MR_AM_NUM_PROGRESS_SPLITS = 12;
    String MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT = "yarn.app.mapreduce.am.containerlauncher.thread-count-limit";
    int DEFAULT_MR_AM_CONTAINERLAUNCHER_THREAD_COUNT_LIMIT = 500;
    String MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE = "yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size";
    int DEFAULT_MR_AM_CONTAINERLAUNCHER_THREADPOOL_INITIAL_SIZE = 10;
    String MR_AM_JOB_CLIENT_THREAD_COUNT = "yarn.app.mapreduce.am.job.client.thread-count";
    int DEFAULT_MR_AM_JOB_CLIENT_THREAD_COUNT = 1;
    String MR_AM_JOB_CLIENT_PORT_RANGE = "yarn.app.mapreduce.am.job.client.port-range";
    String MR_AM_WEBAPP_PORT_RANGE = "yarn.app.mapreduce.am.webapp.port-range";
    String MR_AM_JOB_NODE_BLACKLISTING_ENABLE = "yarn.app.mapreduce.am.job.node-blacklisting.enable";
    String MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERECENT = "yarn.app.mapreduce.am.job.node-blacklisting.ignore-threshold-node-percent";
    int DEFAULT_MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERCENT = 33;
    String MR_AM_JOB_RECOVERY_ENABLE = "yarn.app.mapreduce.am.job.recovery.enable";
    boolean MR_AM_JOB_RECOVERY_ENABLE_DEFAULT = true;
    String MR_AM_JOB_REDUCE_PREEMPTION_LIMIT = "yarn.app.mapreduce.am.job.reduce.preemption.limit";
    float DEFAULT_MR_AM_JOB_REDUCE_PREEMPTION_LIMIT = 0.5F;
    String MR_AM_PREEMPTION_POLICY = "yarn.app.mapreduce.am.preemption.policy";
    String JOB_AM_ACCESS_DISABLED = "mapreduce.job.am-access-disabled";
    boolean DEFAULT_JOB_AM_ACCESS_DISABLED = false;
    String MR_AM_JOB_REDUCE_RAMPUP_UP_LIMIT = "yarn.app.mapreduce.am.job.reduce.rampup.limit";
    float DEFAULT_MR_AM_JOB_REDUCE_RAMP_UP_LIMIT = 0.5F;
    String MR_AM_JOB_SPECULATOR = "yarn.app.mapreduce.am.job.speculator.class";
    String MR_AM_TASK_ESTIMATOR = "yarn.app.mapreduce.am.job.task.estimator.class";
    String MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS = "yarn.app.mapreduce.am.job.task.estimator.exponential.smooth.lambda-ms";
    long DEFAULT_MR_AM_TASK_ESTIMATOR_SMOOTH_LAMBDA_MS = 60000L;
    String MR_AM_TASK_ESTIMATOR_EXPONENTIAL_RATE_ENABLE = "yarn.app.mapreduce.am.job.task.estimator.exponential.smooth.rate";
    String MR_AM_TASK_LISTENER_THREAD_COUNT = "yarn.app.mapreduce.am.job.task.listener.thread-count";
    int DEFAULT_MR_AM_TASK_LISTENER_THREAD_COUNT = 30;
    String MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = "yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms";
    int DEFAULT_MR_AM_TO_RM_HEARTBEAT_INTERVAL_MS = 1000;
    String MR_AM_TO_RM_WAIT_INTERVAL_MS = "yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms";
    int DEFAULT_MR_AM_TO_RM_WAIT_INTERVAL_MS = 360000;
    String MR_AM_COMMITTER_CANCEL_TIMEOUT_MS = "yarn.app.mapreduce.am.job.committer.cancel-timeout";
    int DEFAULT_MR_AM_COMMITTER_CANCEL_TIMEOUT_MS = 60000;
    String MR_AM_COMMIT_WINDOW_MS = "yarn.app.mapreduce.am.job.committer.commit-window";
    int DEFAULT_MR_AM_COMMIT_WINDOW_MS = 10000;
    String MR_AM_CREATE_JH_INTERMEDIATE_BASE_DIR = "yarn.app.mapreduce.am.create-intermediate-jh-base-dir";
    String MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS = "yarn.app.mapreduce.am.history.max-unflushed-events";
    int DEFAULT_MR_AM_HISTORY_MAX_UNFLUSHED_COMPLETE_EVENTS = 200;
    String MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER = "yarn.app.mapreduce.am.history.job-complete-unflushed-multiplier";
    int DEFAULT_MR_AM_HISTORY_JOB_COMPLETE_UNFLUSHED_MULTIPLIER = 30;
    String MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS = "yarn.app.mapreduce.am.history.complete-event-flush-timeout";
    long DEFAULT_MR_AM_HISTORY_COMPLETE_EVENT_FLUSH_TIMEOUT_MS = 30000L;
    String MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD = "yarn.app.mapreduce.am.history.use-batched-flush.queue-size.threshold";
    int DEFAULT_MR_AM_HISTORY_USE_BATCHED_FLUSH_QUEUE_SIZE_THRESHOLD = 50;
    String MR_AM_HARD_KILL_TIMEOUT_MS = "yarn.app.mapreduce.am.hard-kill-timeout-ms";
    long DEFAULT_MR_AM_HARD_KILL_TIMEOUT_MS = 10000L;
    String MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC = "mapreduce.job.reducer.unconditional-preempt.delay.sec";
    int DEFAULT_MR_JOB_REDUCER_UNCONDITIONAL_PREEMPT_DELAY_SEC = 300;
    String MR_JOB_REDUCER_PREEMPT_DELAY_SEC = "mapreduce.job.reducer.preempt.delay.sec";
    int DEFAULT_MR_JOB_REDUCER_PREEMPT_DELAY_SEC = 0;
    String MR_AM_ENV = "yarn.app.mapreduce.am.env";
    String MR_AM_ADMIN_USER_ENV = "yarn.app.mapreduce.am.admin.user.env";
    String DEFAULT_MR_AM_ADMIN_USER_ENV = Shell.WINDOWS ? "" : "LD_LIBRARY_PATH=" + Apps.crossPlatformify("HADOOP_COMMON_HOME") + "/lib/native";
    String MR_AM_PROFILE = "yarn.app.mapreduce.am.profile";
    boolean DEFAULT_MR_AM_PROFILE = false;
    String MR_AM_PROFILE_PARAMS = "yarn.app.mapreduce.am.profile.params";
    String MAPRED_MAP_ADMIN_JAVA_OPTS = "mapreduce.admin.map.child.java.opts";
    String MAPRED_REDUCE_ADMIN_JAVA_OPTS = "mapreduce.admin.reduce.child.java.opts";
    String DEFAULT_MAPRED_ADMIN_JAVA_OPTS = "-Djava.net.preferIPv4Stack=true -Dhadoop.metrics.log.level=WARN ";
    String MAPRED_ADMIN_USER_SHELL = "mapreduce.admin.user.shell";
    String DEFAULT_SHELL = "/bin/bash";
    String MAPRED_ADMIN_USER_ENV = "mapreduce.admin.user.env";
    String DEFAULT_MAPRED_ADMIN_USER_ENV = Shell.WINDOWS ? "PATH=%PATH%;%HADOOP_COMMON_HOME%\\bin" : "LD_LIBRARY_PATH=" + Apps.crossPlatformify("HADOOP_COMMON_HOME") + "/lib/native";
    String WORKDIR = "work";
    String OUTPUT = "output";
    String HADOOP_WORK_DIR = "HADOOP_WORK_DIR";
    String STDOUT_LOGFILE_ENV = "STDOUT_LOGFILE_ENV";
    String STDERR_LOGFILE_ENV = "STDERR_LOGFILE_ENV";
    String JOB_SUBMIT_DIR = "jobSubmitDir";
    String JOB_CONF_FILE = "job.xml";
    String JOB_JAR = "job.jar";
    String JOB_SPLIT = "job.split";
    String JOB_SPLIT_METAINFO = "job.splitmetainfo";
    String APPLICATION_MASTER_CLASS = "org.apache.hadoop.mapreduce.v2.app.MRAppMaster";
    String MAPREDUCE_V2_CHILD_CLASS = "org.apache.hadoop.mapred.YarnChild";
    String APPLICATION_ATTEMPT_ID = "mapreduce.job.application.attempt.id";
    String MR_JOB_END_NOTIFICATION_URL = "mapreduce.job.end-notification.url";
    String MR_JOB_END_NOTIFICATION_PROXY = "mapreduce.job.end-notification.proxy";
    String MR_JOB_END_NOTIFICATION_TIMEOUT = "mapreduce.job.end-notification.timeout";
    String MR_JOB_END_RETRY_ATTEMPTS = "mapreduce.job.end-notification.retry.attempts";
    String MR_JOB_END_RETRY_INTERVAL = "mapreduce.job.end-notification.retry.interval";
    String MR_JOB_END_NOTIFICATION_MAX_ATTEMPTS = "mapreduce.job.end-notification.max.attempts";
    String MR_JOB_END_NOTIFICATION_MAX_RETRY_INTERVAL = "mapreduce.job.end-notification.max.retry.interval";
    int DEFAULT_MR_JOB_END_NOTIFICATION_TIMEOUT = 5000;
    String MR_AM_SECURITY_SERVICE_AUTHORIZATION_TASK_UMBILICAL = "security.job.task.protocol.acl";
    String MR_AM_SECURITY_SERVICE_AUTHORIZATION_CLIENT = "security.job.client.protocol.acl";
    String MAPREDUCE_APPLICATION_CLASSPATH = "mapreduce.application.classpath";
    String MAPREDUCE_JOB_LOG4J_PROPERTIES_FILE = "mapreduce.job.log4j-properties-file";
    String MAPREDUCE_APPLICATION_FRAMEWORK_PATH = "mapreduce.application.framework.path";
    @Public
    @Unstable
    String DEFAULT_MAPREDUCE_CROSS_PLATFORM_APPLICATION_CLASSPATH = Apps.crossPlatformify("HADOOP_MAPRED_HOME") + "/share/hadoop/mapreduce/*," + Apps.crossPlatformify("HADOOP_MAPRED_HOME") + "/share/hadoop/mapreduce/lib/*";
    String DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH = Shell.WINDOWS ? "%HADOOP_MAPRED_HOME%\\share\\hadoop\\mapreduce\\*,%HADOOP_MAPRED_HOME%\\share\\hadoop\\mapreduce\\lib\\*" : "$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*";
    String WORKFLOW_ID = "mapreduce.workflow.id";
    String TASK_LOG_BACKUPS = "yarn.app.mapreduce.task.container.log.backups";
    int DEFAULT_TASK_LOG_BACKUPS = 0;
    String REDUCE_SEPARATE_SHUFFLE_LOG = "yarn.app.mapreduce.shuffle.log.separate";
    boolean DEFAULT_REDUCE_SEPARATE_SHUFFLE_LOG = true;
    String SHUFFLE_LOG_BACKUPS = "yarn.app.mapreduce.shuffle.log.backups";
    int DEFAULT_SHUFFLE_LOG_BACKUPS = 0;
    String SHUFFLE_LOG_KB = "yarn.app.mapreduce.shuffle.log.limit.kb";
    long DEFAULT_SHUFFLE_LOG_KB = 0L;
    String WORKFLOW_NAME = "mapreduce.workflow.name";
    String WORKFLOW_NODE_NAME = "mapreduce.workflow.node.name";
    String WORKFLOW_ADJACENCY_PREFIX_STRING = "mapreduce.workflow.adjacency.";
    String WORKFLOW_ADJACENCY_PREFIX_PATTERN = "^mapreduce\\.workflow\\.adjacency\\..+";
    String WORKFLOW_TAGS = "mapreduce.workflow.tags";
    String MR_AM_MAX_ATTEMPTS = "mapreduce.am.max-attempts";
    int DEFAULT_MR_AM_MAX_ATTEMPTS = 2;
    String MR_APPLICATION_TYPE = "MAPREDUCE";
    String TASK_PREEMPTION = "mapreduce.job.preemption";
    String HEAP_MEMORY_MB_RATIO = "mapreduce.job.heap.memory-mb.ratio";
    float DEFAULT_HEAP_MEMORY_MB_RATIO = 0.8F;
    String MR_ENCRYPTED_INTERMEDIATE_DATA = "mapreduce.job.encrypted-intermediate-data";
    boolean DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA = false;
    String MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS = "mapreduce.job.encrypted-intermediate-data-key-size-bits";
    int DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_KEY_SIZE_BITS = 128;
    String MR_ENCRYPTED_INTERMEDIATE_DATA_BUFFER_KB = "mapreduce.job.encrypted-intermediate-data.buffer.kb";
    int DEFAULT_MR_ENCRYPTED_INTERMEDIATE_DATA_BUFFER_KB = 128;
    String MAX_RESOURCES = "mapreduce.job.cache.limit.max-resources";
    int MAX_RESOURCES_DEFAULT = 0;
    String MAX_RESOURCES_MB = "mapreduce.job.cache.limit.max-resources-mb";
    long MAX_RESOURCES_MB_DEFAULT = 0L;
    String MAX_SINGLE_RESOURCE_MB = "mapreduce.job.cache.limit.max-single-resource-mb";
    long MAX_SINGLE_RESOURCE_MB_DEFAULT = 0L;
    String MR_NUM_OPPORTUNISTIC_MAPS_PERCENT = "mapreduce.job.num-opportunistic-maps-percent";
    int DEFAULT_MR_NUM_OPPORTUNISTIC_MAPS_PERCENT = 0;
    String MR_JOB_REDACTED_PROPERTIES = "mapreduce.job.redacted-properties";
    String MR_JOB_SEND_TOKEN_CONF = "mapreduce.job.send-token-conf";
    String FINISH_JOB_WHEN_REDUCERS_DONE = "mapreduce.job.finish-when-all-reducers-done";
    boolean DEFAULT_FINISH_JOB_WHEN_REDUCERS_DONE = true;
    String MR_AM_STAGING_DIR_ERASURECODING_ENABLED = "yarn.app.mapreduce.am.staging-dir.erasurecoding.enabled";
    boolean DEFAULT_MR_AM_STAGING_ERASURECODING_ENABLED = false;
}

http://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/apidocs/org/apache/hadoop/mapreduce/Mapper.Context.html?is-external=true

 

The Reducer class, which you implement (typically as a static nested class inside the job class):

@Checkpointable
@Public
@Stable
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    public Reducer() {
    }

    protected void setup(Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
    }

    protected void reduce(KEYIN key, Iterable<VALUEIN> values, Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
        Iterator var4 = values.iterator();

        while(var4.hasNext()) {
            VALUEIN value = var4.next();
            context.write(key, value);
        }

    }

    protected void cleanup(Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
    }

    public void run(Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
        this.setup(context);

        try {
            while(context.nextKey()) {
                this.reduce(context.getCurrentKey(), context.getValues(), context);
                Iterator<VALUEIN> iter = context.getValues().iterator();
                if (iter instanceof ValueIterator) {
                    ((ValueIterator)iter).resetBackupStore();
                }
            }
        } finally {
            this.cleanup(context);
        }

    }

    public abstract class Context implements ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
        public Context() {
        }
    }
}
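As with the Mapper, a job normally just overrides reduce(), plus setup()/cleanup() when needed. Below is a small sketch, with a hypothetical class name and configuration key, of a reducer that sums the counts per key and only emits keys whose total reaches a configurable minimum.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer: sums the counts per key and only emits keys whose total
// reaches a minimum read from the configuration in setup().
public class MinCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private int minCount;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        minCount = context.getConfiguration().getInt("demo.min.count", 1); // hypothetical key
    }

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        if (sum >= minCount) {
            context.write(key, new IntWritable(sum));
        }
    }
}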

 

@Public
@Evolving
public interface ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
    boolean nextKey() throws IOException, InterruptedException;

    Iterable<VALUEIN> getValues() throws IOException, InterruptedException;

    public interface ValueIterator<VALUEIN> extends MarkableIteratorInterface<VALUEIN> {
        void resetBackupStore() throws IOException;
    }
}

From there the inheritance chain is the same as for the Mapper's Context (TaskInputOutputContext, TaskAttemptContext, JobContext, MRJobConfig).

Commonly used parameterizations:

Mapper<LongWritable, Text, Text, IntWritable>
Reducer<Text, IntWritable, Text, IntWritable>

Writable classes

Classes such as LongWritable, Text, and IntWritable all belong to the org.apache.hadoop.io package: http://hadoop.apache.org/docs/current/api/

The Writable wrappers for the Java primitive types are listed below. All of these Writable classes extend WritableComparable, which means they can be compared with one another (and therefore sorted as keys), and all of them provide get() and set() methods for reading and writing the wrapped value.

Java primitive   Writable implementation   Serialized size (bytes)
boolean          BooleanWritable           1
byte             ByteWritable              1
short            ShortWritable             2
int              IntWritable               4
                 VIntWritable              1~5
float            FloatWritable             4
long             LongWritable              8
                 VLongWritable             1~9
double           DoubleWritable            8

Text is the Writable class for UTF-8 byte sequences and can generally be regarded as the Writable equivalent of java.lang.String. Text stores the number of bytes in the string encoding as a variable-length integer, so its maximum size is 2 GB. Because it uses standard UTF-8, Text interoperates easily with other tools that understand UTF-8.
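To make get()/set() and the comparison behaviour concrete, here is a small standalone sketch (plain Java against org.apache.hadoop.io only; the class name is made up):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) {
        // set()/get() wrap and unwrap the underlying Java value
        IntWritable count = new IntWritable();
        count.set(42);
        System.out.println(count.get());       // 42

        // Writables are WritableComparable, so they can be compared (and sorted as map output keys)
        LongWritable a = new LongWritable(1L);
        LongWritable b = new LongWritable(2L);
        System.out.println(a.compareTo(b));    // negative: a < b

        // Text is the UTF-8 counterpart of java.lang.String
        Text word = new Text("hadoop");
        System.out.println(word.getLength());  // number of UTF-8 bytes, here 6
        System.out.println(word.toString());   // back to a String
    }
}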

 

What needs to be set on the Job

job = Job.getInstance(conf);

job.setJarByClass(WordCountMRJob.class);

// configure the map phase
job.setMapperClass(WordCountMapper.class);
job.setMapOutputKeyClass(Text.class);      // map output key type
job.setMapOutputValueClass(IntWritable.class); // map output value type

// configure the reduce phase
job.setReducerClass(WordCountReducer.class);
job.setOutputKeyClass(Text.class);         // reduce output key type
job.setOutputValueClass(IntWritable.class); // reduce output value type

You also need input and output paths:

FileInputFormat.addInputPath(job,path);

FileOutputFormat.setOutputPath(job,output);

 

 

 

2. Warm-up: WordCount

Create a new Maven project.

Configure pom.xml according to your ZooKeeper and Hadoop versions; you can check the ZooKeeper version with echo stat|nc localhost 2181.

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.aidata</groupId>
    <artifactId>bigdata</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <hadoop-version>3.0.0</hadoop-version>
        <zookeeper-version>3.4.5</zookeeper-version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>${zookeeper-version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop-version}</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.3</version>
                <configuration>
                    <classifier>dist</classifier>
                    <appendAssemblyId>true</appendAssemblyId>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>

Upload three tab-separated word files to HDFS:

hdfs dfs -put wc_tes* /input/wc

Write the MapReduce program:

package com.aidata.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    //Map phase

    /**
     * Input key/value types:
     * LongWritable: byte offset of the line within the input
     * Text: the input line
     *
     * Output key/value types:
     * Text: output key type
     * IntWritable: output value type
     */
    public static class WordCountMapper extends Mapper<LongWritable,Text, Text, IntWritable>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] words = line.split("\t");

            for(String word : words){
                //word 1
                context.write(new Text(word),new IntWritable(1));
            }
        }
    }
    //Reduce phase

    /**
     * Input key/value types:
     * Text: input key type
     * IntWritable: input value type
     *
     * Output key/value types:
     * Text: output key type
     * IntWritable: output value type
     */
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word {1,1,1,...}

            int sum = 0;

            for(IntWritable value : values){
                sum += value.get();
            }

            context.write(key,new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        //1. set up the configuration
        Configuration conf = new Configuration();
        Job job = null;

        //2. create the job
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        job.setJarByClass(WordCountMRJob.class);

        //3. wire the execution flow into the job

        //3.1 path of the input files to process in HDFS
        Path path = new Path(args[0]);

        try {
            //add the input path to the job
            FileInputFormat.addInputPath(job,path);
        } catch (IOException e) {
            e.printStackTrace();
        }

        //3.2 configure the map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);//map output key type
        job.setMapOutputValueClass(IntWritable.class); //map output value type

        //3.3 configure the reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);//reduce output key type
        job.setOutputValueClass(IntWritable.class);//reduce output value type

        //3.4 set the job's output path
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job,output);

        //4. submit the job and wait for it to finish
        try {
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}

Run Maven's package goal to build; the jar ends up in the target directory. If IDEA does not show a target directory, refresh the project view.

 

Upload the jar to the cluster and run it:

 hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/wc/ /output/wc

Check the output:

hdfs dfs -ls /output/wc

If you are using LZO

For example, LZO is installed in your CDH cluster and you want to try it out.

Install lzop:

yum install lzop

Copy the LZO jar to your local machine. I am on CDH 6.3.1, where the jar lives at:

/opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar

Take the three tab-separated word files.

Compress them:

lzop -v wc*.txt

Upload them to HDFS:

hdfs dfs -put wc*.txt.lzo /input

Build the index

Splittability of LZO-compressed files depends on an index, so we need to create one for each LZO file by hand. Without an index an LZO file produces only a single split.

hadoop jar /opt/cloudera/parcels/GPLEXTRAS-6.3.1-1.gplextras6.3.1.p0.1470567/lib/hadoop/lib/hadoop-lzo-0.4.15-cdh6.3.1.jar com.hadoop.compression.lzo.DistributedLzoIndexer /input/

 

Put the LZO jar into the project's resources directory in IDEA and click "Add as Library".

Since this is a third-party jar, Maven also has to be told about it; otherwise it will not recognize it.

Maven compiles with the maven-compiler-plugin, which has no way of knowing where this third-party jar lives, so the build fails with "package xxx does not exist". The fix:

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <compilerArguments>
                        <extdirs>${project.basedir}/src/main/resources</extdirs>
                    </compilerArguments>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>

Modify the MapReduce program slightly:

package com.aidata.mapreduce;

import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.mapreduce.LzoTextInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountMRJob {

    //Map phase

    public static class WordCountMapper extends Mapper<LongWritable,Text, Text, IntWritable>{

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            String[] words = line.split("\t");

            for(String word : words){
                //word 1
                context.write(new Text(word),new IntWritable(1));
            }
        }
    }
    //Reduce phase


    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // word {1,1,1,...}

            int sum = 0;

            for(IntWritable value : values){
                sum += value.get();
            }

            context.write(key,new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        //1. set up the configuration
        Configuration conf = new Configuration();
        Job job = null;

        //2. create the job
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        job.setJarByClass(WordCountMRJob.class);
        job.setInputFormatClass(LzoTextInputFormat.class);
        //enable compression of the reduce output and set the codec
        FileOutputFormat.setCompressOutput(job, true);
        FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

        //3. wire the execution flow into the job

        //3.1 path of the input files to process in HDFS
        Path path = new Path(args[0]);

        try {
            //add the input path to the job
            FileInputFormat.addInputPath(job,path);
        } catch (IOException e) {
            e.printStackTrace();
        }

        //3.2 configure the map phase
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);//map output key type
        job.setMapOutputValueClass(IntWritable.class); //map output value type

        //3.3 configure the reduce phase
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);//reduce output key type
        job.setOutputValueClass(IntWritable.class);//reduce output value type

        //3.4 set the job's output path
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job,output);

        //4. submit the job and wait for it to finish
        try {
            boolean result = job.waitForCompletion(true);
            System.exit(result ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}

Run the program:

hadoop jar bigdata-1.0-SNAPSHOT.jar com.aidata.mapreduce.WordCountMRJob /input/ /output

If you did not include

FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, LzopCodec.class);

then you have to specify the output format some other way.

If the LZO input and output formats were not configured inside the program, they can be passed on the command line as -D options (note that these generic options are only picked up when the driver parses them via ToolRunner/GenericOptionsParser, described below):

hadoop jar myjar.jar \
 -D mapred.reduce.tasks=2 \
 -D mapreduce.job.inputformat.class=com.hadoop.mapreduce.LzoTextInputFormat \
 -D mapred.output.compress=true \
 -D mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec \
 /input /output

 

Setting the number of reduce tasks in CDH.

MapReduce helper classes (Tool / ToolRunner)

Versions since MapReduce 2.0 provide new helper classes to standardize how jobs are written.

You implement the Tool interface:

@Public
@Stable
public interface Tool extends Configurable {
    int run(String[] var1) throws Exception;
}

The Tool interface extends Configurable:

@Public
@Stable
public interface Configurable {
    void setConf(Configuration var1);

    Configuration getConf();
}

For convenience there is also the Configured class, which already implements the Configurable interface's methods, so by extending it we do not have to implement them ourselves:

@Public
@Stable
public class Configured implements Configurable {
    private Configuration conf;

    public Configured() {
        this((Configuration)null);
    }

    public Configured(Configuration conf) {
        this.setConf(conf);
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    public Configuration getConf() {
        return this.conf;
    }
}

All we need to implement is run(); the ToolRunner utility class is provided to call run() for us:

@Public
@Stable
public class ToolRunner {
    public ToolRunner() {
    }

    public static int run(Configuration conf, Tool tool, String[] args) throws Exception {
        if (CallerContext.getCurrent() == null) {
            CallerContext ctx = (new Builder("CLI")).build();
            CallerContext.setCurrent(ctx);
        }

        if (conf == null) {
            conf = new Configuration();
        }

        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        tool.setConf(conf);
        String[] toolArgs = parser.getRemainingArgs();
        return tool.run(toolArgs);
    }

    public static int run(Tool tool, String[] args) throws Exception {
        return run(tool.getConf(), tool, args);
    }

    public static void printGenericCommandUsage(PrintStream out) {
        GenericOptionsParser.printGenericCommandUsage(out);
    }

    public static boolean confirmPrompt(String prompt) throws IOException {
        while(true) {
            System.err.print(prompt + " (Y or N) ");
            StringBuilder responseBuilder = new StringBuilder();

            while(true) {
                int c = System.in.read();
                if (c == -1 || c == 13 || c == 10) {
                    String response = responseBuilder.toString();
                    if (!response.equalsIgnoreCase("y") && !response.equalsIgnoreCase("yes")) {
                        if (!response.equalsIgnoreCase("n") && !response.equalsIgnoreCase("no")) {
                            System.err.println("Invalid input: " + response);
                            break;
                        }

                        return false;
                    }

                    return true;
                }

                responseBuilder.append((char)c);
            }
        }
    }
}

Let's take a closer look at ToolRunner's run() method:

public static int run(Configuration conf, Tool tool, String[] args) throws Exception {
        if (CallerContext.getCurrent() == null) {
            CallerContext ctx = (new Builder("CLI")).build();
            CallerContext.setCurrent(ctx);
        }

        if (conf == null) {
            conf = new Configuration();
        }

        GenericOptionsParser parser = new GenericOptionsParser(conf, args);
        tool.setConf(conf);
        String[] toolArgs = parser.getRemainingArgs();
        return tool.run(toolArgs);
    }

The first parameter is a Configuration. It is not the same Configuration used inside your Tool's overridden run() method; this one exists specifically to hold the parsed parameters.

If conf is null, a new Configuration is created, so passing null is also fine.

It then parses the external arguments supplied when the job is submitted:

GenericOptionsParser parser = new GenericOptionsParser(conf, args);

The job-related options are added to conf:

tool.setConf(conf);

tool.setConf(conf) hands this Configuration to the object that implements Tool, i.e. the job-related parameters received here are merged into that overall Configuration.

Anything not job-related is passed on to the run() method:

String[] toolArgs = parser.getRemainingArgs();
return tool.run(toolArgs);

The parameters that can be passed in from outside are the configuration keys of mapred-default.xml; see https://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml for the available ones.

For example, to set the number of reduce tasks, pass the property after -D (generic options go right after the main class, before the application's own arguments):

hadoop jar xxx.jar MainClass -Dmapreduce.job.reduces=3 /input/path /output/path

Parameters prefixed with -D are put into conf; those without it, such as the input and output paths, are passed on to tool.run().
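A tiny hypothetical Tool makes this split visible, assuming it is run as hadoop jar demo.jar ArgsDemo -Dmapreduce.job.reduces=3 /input /output:

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Hypothetical demo: -D properties end up in the Configuration,
// everything else is handed to run() as the remaining arguments.
public class ArgsDemo extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        System.out.println("mapreduce.job.reduces = " + getConf().get("mapreduce.job.reduces"));
        for (String arg : args) {
            System.out.println("remaining arg: " + arg); // e.g. /input and /output
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        // passing null is fine: ToolRunner creates a Configuration itself
        System.exit(ToolRunner.run(null, new ArgsDemo(), args));
    }
}

Rewritten on top of Configured, Tool and ToolRunner, the WordCount driver then looks like this: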

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool {

    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] words = line.split("\t");
            for (String word: words) {
                context.write(new Text(word), new IntWritable(1));
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value: values){
                sum = sum + value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = this.getConf();
        Job job = null;
        try {
            job = Job.getInstance(conf);
        } catch (IOException e) {
            e.printStackTrace();
        }
        job.setJarByClass(WordCount.class);

        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        Path path = new Path(args[0]);
        FileInputFormat.addInputPath(job, path);
        Path out = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, out);

        boolean result = job.waitForCompletion(true);
        return result? 0: 1;
    }

    public static void main(String[] args) {

        // for local testing
        if (args.length == 0){
            args = new String[]{
                    "hdfs://ns/input/wc/",
                    "hdfs://ns/output/wc"
            };
        }
        // set up the configuration
        Configuration conf = new Configuration();
        Path hdfsOutPutPath = new Path(args[1]);
        try {
            FileSystem fileSystem = FileSystem.get(conf);
            if (fileSystem.exists(hdfsOutPutPath)){
                fileSystem.delete(hdfsOutPutPath, true);
            }
        }catch (Exception e){
            e.printStackTrace();
        }
        try {
            int stat = ToolRunner.run(null, new WordCount(), args);
            System.exit(stat);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

The following call

int stat = ToolRunner.run(null, new WordCount(), args);

receives a value that ultimately comes from

boolean result = job.waitForCompletion(true);
return result? 0: 1;

in other words, 0 is returned when the job completes successfully and 1 otherwise.

Since the program exits with System.exit(stat), System.exit(0) means a normal exit, while System.exit(1), or any non-zero status, means an abnormal exit.

 

3. Website Log Analysis Project

 

