hbase 多线程 大数据量入库

本文介绍了一种利用线程池进行HBase数据库批量插入的方法,通过实例展示了如何在HBase中高效地插入大量数据。同时,文章还详细介绍了HDFS和HBase的高可用配置,包括HDFS的复制因子、名称节点配置、HBase的数据存储路径及ZooKeeper配置。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

1.插入数据库

/**
 * Inserts rows 0..999 into table {@code ns1:t2} by splitting the range into four
 * {@link PoolInsert} tasks that run on the shared thread pool, then waits for
 * all of them to finish before reporting success.
 *
 * <p>All tasks share one {@code Connection}; it is closed once every task has
 * completed (or the wait has failed).
 */
@Test
public void poolinserts(){
    final String tableName = "ns1:t2";
    ThreadPoolExecutor pool = ThreadPool.getPool();
    // try-with-resources guarantees the shared connection is released, even on failure.
    try (Connection conn = HbaseUtils.conn()) {
        // HbaseUtils.conn() returns null when the connection could not be created.
        if (conn == null) {
            throw new IllegalStateException("HBase connection could not be created");
        }

        List<PoolInsert> poolInserts = new ArrayList<PoolInsert>();
        poolInserts.add(new PoolInsert(0,300,tableName,conn));
        poolInserts.add(new PoolInsert(300,600,tableName,conn));
        poolInserts.add(new PoolInsert(600,900,tableName,conn));
        poolInserts.add(new PoolInsert(900,1000,tableName,conn));
        for (PoolInsert p:poolInserts){
            pool.execute(p);
        }

        // Request shutdown once, then block until the queued tasks drain instead of
        // spinning on isTerminated(). Fully-qualified names are used so that this
        // method needs no additional imports.
        pool.shutdown();
        if (!pool.awaitTermination(10, java.util.concurrent.TimeUnit.MINUTES)) {
            pool.shutdownNow();
            throw new IllegalStateException("insert tasks did not finish within 10 minutes");
        }
        System.out.println("成功");
    } catch (InterruptedException e) {
        pool.shutdownNow();
        // Restore the interrupt flag for whoever is running this test.
        Thread.currentThread().interrupt();
        throw new IllegalStateException("interrupted while waiting for insert tasks", e);
    } catch (java.io.IOException e) {
        // Raised by Connection.close().
        throw new java.io.UncheckedIOException("failed to close HBase connection", e);
    } finally {
        // Covers the early-exit paths (e.g. null connection); a no-op if already shut down.
        pool.shutdown();
    }
}

2.hbase连接工具

/**
 * Creates a new HBase {@code Connection} from the configuration returned by
 * {@code HBaseConfiguration.create()}.
 *
 * <p>The caller owns the returned connection and is responsible for closing it.
 *
 * @return a new connection, or {@code null} if creating it failed with an
 *         {@code IOException} (the failure is logged)
 */
public static Connection conn(){
    Configuration conf = HBaseConfiguration.create();
    try {
        return ConnectionFactory.createConnection(conf);
    }catch (IOException e){
        // The message fills the {} placeholder; the exception is passed last so the
        // logger also records the stack trace. Passing only the exception leaves the
        // placeholder unfilled.
        log.error("连接异常:{}", e.getMessage(), e);
        return null;
    }
}

3.1 hdfs-site.xml配置

 <configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.nameservices</name>
        <value>mycluster</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.mycluster</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn1</name>
        <value>s226:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.mycluster.nn2</name>
        <value>s229:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn1</name>
        <value>s226:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.mycluster.nn2</name>
        <value>s229:50070</value>
    </property>
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://s227:8485;s228:8485;s229:8485/mycluster</value>
    </property>
    <property>
        <name>dfs.client.failover.proxy.provider.mycluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/centos/.ssh/id_rsa</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/home/centos/hadoop/journal</value>
    </property>
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
        <description>
            Whether automatic failover is enabled. See the HDFS High
            Availability documentation for details on automatic HA
            configuration.
        </description>
    </property>
</configuration>

3.2 hbase-site.xml配置文件

<configuration>
    <!-- Run HBase in fully distributed mode -->
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>

    <!-- HDFS path where HBase stores its data; "mycluster" matches the dfs.nameservices value in hdfs-site.xml -->
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://mycluster/hbase</value>
    </property>
    <!-- ZooKeeper quorum addresses -->
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>s227:2181,s228:2181,s229:2181</value>
    </property>
    <!-- Local data directory for ZooKeeper -->
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>/home/centos/zookeeper</value>
    </property>
</configuration>

4.需要插入的对象封装

@Slf4j
@Data
public class PoolInsert implements Runnable{
	
	    private int start;
	    private int end;
	    private String tableName;
	    private Connection conn;
	
	    public PoolInsert(int start, int end, String tableName, Connection conn) {
	        this.start = start;
	        this.end = end;
	        this.tableName = tableName;
	        this.conn = conn;
	    }
	
	    @Override
	    public void run() {
	        System.out.println(1111);
	        //inset(start,end,tableName,conn);
	        try {
	            TableName tname = TableName.valueOf(tableName);
	            HTable table = (HTable) conn.getTable(tname);
	
	            DecimalFormat df = new DecimalFormat("0000");
	            table.setAutoFlush(false);
	            for (int j = start; j < end; j++) {
	                byte[] rowkey = Bytes.toBytes("row" + df.format(j));
	
	                Put put = new Put(rowkey);
	                put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("id"), Bytes.toBytes(end - 1));
	                put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("name"), Bytes.toBytes("name" + j));
	                table.put(put);
	                if (j % 2000 == 0) {
	                    table.flushCommits();
	                }
	            }
	            table.flushCommits();
	        }catch(Exception e){
	            log.error("插入异常:{}",e);
	        }
	        System.out.println(2222);
	    }
	    //百万插入
	    private String inset(int start, int end, String tableName, Connection conn ) {
	        try {
	            TableName tname = TableName.valueOf(tableName);
	            HTable table = (HTable) conn.getTable(tname);
	
	            DecimalFormat df = new DecimalFormat("0000");
	            table.setAutoFlush(false);
	            for (int j = start; j < end; j++) {
	                byte[] rowkey = Bytes.toBytes("row" + df.format(j));
	
	                Put put = new Put(rowkey);
	                put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("id"), Bytes.toBytes(end - 1));
	                put.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("name"), Bytes.toBytes("name" + j));
	                table.put(put);
	                if (j % 2000 == 0) {
	                    table.flushCommits();
	                }
	            }
	            table.flushCommits();
	        }catch(Exception e){
	            log.error("插入异常:{}",e);
	        }
	        return "完成:"+(end-start);
	    }
	
	}
HBase中进行多线程批量数据写入可以提高写入效率。以下是一个简单的示例代码,演示了如何使用Java多线程进行批量数据写入:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class HBaseMultiThreadedWriter {

    private static final String TABLE_NAME = "your_table";
    private static final String COLUMN_FAMILY = "cf";
    private static final String COLUMN_QUALIFIER = "col";

    public static void main(String[] args) {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "your_zookeeper_quorum");

        // The Connection is shared by all tasks and stays open until they have finished.
        try (Connection connection = ConnectionFactory.createConnection(config)) {
            ExecutorService executorService = Executors.newFixedThreadPool(10); // thread pool size

            List<Runnable> tasks = new ArrayList<>();

            // Create 100 write tasks
            for (int i = 0; i < 100; i++) {
                final int index = i;
                Runnable task = () -> {
                    // Each task uses its own Table handle: Table instances must not be
                    // shared between threads, while the Connection can be.
                    try (Table table = connection.getTable(TableName.valueOf(TABLE_NAME))) {
                        // Build the Put
                        Put put = new Put(Bytes.toBytes("rowkey_" + index));
                        put.addColumn(Bytes.toBytes(COLUMN_FAMILY), Bytes.toBytes(COLUMN_QUALIFIER), Bytes.toBytes("value_" + index));

                        // Execute the write
                        table.put(put);
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                };
                tasks.add(task);
            }

            // Submit the tasks to the thread pool
            tasks.forEach(executorService::submit);

            // Stop accepting new tasks and wait for the submitted ones to finish
            // before the connection is closed at the end of the try block.
            executorService.shutdown();
            if (!executorService.awaitTermination(10, TimeUnit.MINUTES)) {
                executorService.shutdownNow();
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
```

在上述示例代码中,我们使用了Java的`ExecutorService`和`Runnable`接口来创建一个固定大小的线程池,并提交多个写入任务。每个任务都是独立的,负责向HBase写入一行数据。

通过使用多线程批量写入,可以并行地向HBase写入多个数据行,从而提高写入效率。请根据实际情况调整线程池大小和批量写入数据量。记得根据需要设置适当的HBase连接参数和表信息。

需要注意的是,多线程写入时可能会对HBase集群产生较大的负载,请确保集群的硬件资源和网络带宽足够支持高并发的写入操作。此外,还要考虑表的预分区策略、RegionServer的负载均衡等因素,以避免潜在的性能问题。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值