What is Apache HBase?
Apache HBase is a distributed NoSQL database providing real-time random read/write access to big data, with automatic scaling, fault tolerance, and strong consistency (every read sees the latest acknowledged write). Built on top of Hadoop HDFS, it is designed to handle billions of rows and millions of columns with low-latency access patterns.
HBase follows Google's Bigtable design, offering column-family storage with automatic sharding, real-time queries, and horizontal scalability. It is ideal for applications requiring fast, random access to large datasets, such as time-series data, user profiles, and content management.
Real-World Examples
Facebook
Social graph and messaging infrastructure
- 100+ PB of data across multiple clusters
- 1+ billion users' social data
- 10+ million queries per second
- 99.99% uptime for messaging
Adobe
Customer analytics and behavioral tracking
- 50+ PB customer interaction data
- 5000+ nodes across clusters
- 1+ trillion events processed monthly
- Sub-second analytics queries
Salesforce
CRM platform and customer data management
- 20+ PB customer relationship data
- 150+ million CRM users supported
- 1+ billion API calls daily
- Multi-tenant architecture at scale
Pinterest
Content discovery and user engagement
- 10+ PB pin and board data
- 400+ million active users
- 100+ billion pins indexed
- Real-time recommendation engine
Best Practices
Do's
- ✓ Design row keys to distribute data evenly and avoid hotspotting (see the sketch after this list)
- ✓ Use column families for similar access patterns
- ✓ Enable compression (Snappy/LZ4) for storage efficiency
- ✓ Configure Bloom filters for read-heavy workloads
- ✓ Use batch operations for bulk writes
- ✓ Monitor region split patterns and pre-split large tables
- ✓ Implement proper security with Kerberos authentication
- ✓ Use connection pooling for high-concurrency applications
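As referenced in the first item, here is a minimal Java sketch of two of these practices: salting row keys so sequential IDs don't hotspot a single region, and batching bulk writes through BufferedMutator. The table name "events" and the four-character salt length are illustrative assumptions, not a prescribed layout.

import java.util.List;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.MD5Hash;

public class WritePatterns {
    // Prefix the natural key with a short hash "salt" so otherwise
    // sequential keys (IDs, timestamps) spread across regions instead
    // of all landing on one RegionServer.
    static byte[] saltedKey(String naturalKey) {
        String salt = MD5Hash.getMD5AsHex(Bytes.toBytes(naturalKey)).substring(0, 4);
        return Bytes.toBytes(salt + "-" + naturalKey);
    }

    // BufferedMutator queues puts client-side and ships them in batches,
    // which is far cheaper than one RPC per Put for bulk writes.
    static void bulkWrite(Connection connection, List<Put> puts) throws Exception {
        try (BufferedMutator mutator =
                 connection.getBufferedMutator(TableName.valueOf("events"))) {
            for (Put put : puts) {
                mutator.mutate(put); // buffered, not sent immediately
            }
        } // close() flushes any remaining buffered mutations
    }
}

Note that range scans over salted keys must fan out across all salt prefixes, so this layout trades scan convenience for write throughput.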
Don'ts
- ✗ Don't use sequential or timestamp-based row keys (causes hotspotting)
- ✗ Don't create too many column families (impacts performance)
- ✗ Don't ignore MemStore and BlockCache tuning
- ✗ Don't disable WAL without understanding durability trade-offs (see the sketch after this list)
- ✗ Don't store large binary objects without considering alternatives
- ✗ Don't neglect compaction monitoring and tuning
- ✗ Don't run production without proper monitoring and alerting
- ✗ Don't assume ACID transactions across multiple rows
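To make the WAL trade-off concrete (see the flagged item above), a small sketch showing that durability is chosen per mutation, so skipping the WAL is always an explicit opt-in. The "activity:reading" column here is hypothetical.

import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class WalTradeoff {
    // Build a Put that deliberately skips the write-ahead log
    static Put fastButUnsafePut(String rowKey, String value) {
        Put put = new Put(Bytes.toBytes(rowKey));
        put.addColumn(Bytes.toBytes("activity"), Bytes.toBytes("reading"),
                      Bytes.toBytes(value));
        // SKIP_WAL raises ingest throughput, but edits still sitting in
        // the MemStore are lost if the RegionServer crashes before a
        // flush. SYNC_WAL is the safe default; FSYNC_WAL is stricter.
        put.setDurability(Durability.SKIP_WAL);
        return put;
    }
}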
Core Concepts Deep Dive
HBase is a column-oriented NoSQL database whose data model is a sparse, distributed, persistent, multidimensional sorted map, optimized for real-time random read/write access.
Key Points:
- Four-dimensional data model: Row Key, Column Family, Column Qualifier, Timestamp
- Column families group related columns with similar access patterns
- Sparse data storage - only stores non-null values to save space
- Automatic versioning with configurable timestamp-based versions
- Lexicographic row key ordering for range scans and locality
- Schema-less design within column families for flexibility
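That "multidimensional sorted map" can be pictured as nested sorted maps. Below is a purely conceptual Java sketch of the logical shape, not an HBase API, though the client's Result.getMap() does return the inner three levels for a single row:

import java.util.NavigableMap;
import java.util.TreeMap;
import org.apache.hadoop.hbase.util.Bytes;

public class LogicalModel {
    // rowKey -> columnFamily -> qualifier -> timestamp -> value;
    // every level is kept sorted, which is what makes range scans cheap
    static NavigableMap<byte[],
               NavigableMap<byte[],
                   NavigableMap<byte[],
                       NavigableMap<Long, byte[]>>>> newTableView() {
        return new TreeMap<>(Bytes.BYTES_COMPARATOR);
    }
}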
Implementation Example:
# HBase Table Definition
# Create table with column families
create 'users', {NAME => 'personal', VERSIONS => 3}, {NAME => 'activity', TTL => 86400}
# HBase Shell Operations
# Put data with explicit timestamps
put 'users', 'user123', 'personal:name', 'John Doe'
put 'users', 'user123', 'personal:email', 'john@example.com'
put 'users', 'user123', 'activity:last_login', '2024-01-15 10:30:00'
put 'users', 'user123', 'activity:page_views', '1250'
# Scan with filters
scan 'users', {COLUMNS => ['personal'], LIMIT => 10}
get 'users', 'user123', {COLUMN => 'personal:name', VERSIONS => 3}
// Java HBase Client API
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;

public class HBaseExample {
    private Connection connection;
    private Admin admin;

    public void initializeConnection() throws Exception {
        Configuration config = HBaseConfiguration.create();
        config.set("hbase.zookeeper.quorum", "localhost");
        config.set("hbase.zookeeper.property.clientPort", "2181");
        connection = ConnectionFactory.createConnection(config);
        admin = connection.getAdmin();
    }

    public void createTable(String tableName, String[] columnFamilies) throws Exception {
        TableName table = TableName.valueOf(tableName);
        if (admin.tableExists(table)) {
            System.out.println("Table already exists");
            return;
        }
        // TableDescriptorBuilder/ColumnFamilyDescriptorBuilder replace the
        // HTableDescriptor/HColumnDescriptor API deprecated in HBase 2.x
        TableDescriptorBuilder tableBuilder = TableDescriptorBuilder.newBuilder(table);
        for (String cf : columnFamilies) {
            tableBuilder.setColumnFamily(
                ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes(cf))
                    .setMaxVersions(3)
                    .setTimeToLive(86400) // 1 day TTL
                    .setCompressionType(Compression.Algorithm.SNAPPY)
                    .build());
        }
        admin.createTable(tableBuilder.build());
        System.out.println("Table created successfully");
    }

    public void putData(String tableName, String rowKey, String columnFamily,
                        String qualifier, String value) throws Exception {
        try (Table table = connection.getTable(TableName.valueOf(tableName))) {
            Put put = new Put(Bytes.toBytes(rowKey));
            put.addColumn(Bytes.toBytes(columnFamily),
                          Bytes.toBytes(qualifier),
                          Bytes.toBytes(value));
            table.put(put);
        }
    }

    public void getData(String tableName, String rowKey) throws Exception {
        try (Table table = connection.getTable(TableName.valueOf(tableName))) {
            Get get = new Get(Bytes.toBytes(rowKey));
            Result result = table.get(get);
            for (Cell cell : result.rawCells()) {
                System.out.printf("Row: %s, CF: %s, Qualifier: %s, Value: %s, Timestamp: %d%n",
                    Bytes.toString(CellUtil.cloneRow(cell)),
                    Bytes.toString(CellUtil.cloneFamily(cell)),
                    Bytes.toString(CellUtil.cloneQualifier(cell)),
                    Bytes.toString(CellUtil.cloneValue(cell)),
                    cell.getTimestamp());
            }
        }
    }

    public void scanTable(String tableName, String startRow, String stopRow) throws Exception {
        try (Table table = connection.getTable(TableName.valueOf(tableName))) {
            Scan scan = new Scan()
                .withStartRow(Bytes.toBytes(startRow))
                .withStopRow(Bytes.toBytes(stopRow))
                .setCaching(100); // fetch 100 rows per RPC
            try (ResultScanner scanner = table.getScanner(scan)) {
                for (Result result : scanner) {
                    System.out.println("Row: " + Bytes.toString(result.getRow()));
                    for (Cell cell : result.rawCells()) {
                        System.out.printf("  %s:%s = %s%n",
                            Bytes.toString(CellUtil.cloneFamily(cell)),
                            Bytes.toString(CellUtil.cloneQualifier(cell)),
                            Bytes.toString(CellUtil.cloneValue(cell)));
                    }
                }
            }
        }
    }

    public void close() throws Exception {
        admin.close();
        connection.close();
    }
}
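A short, hypothetical driver for the class above, reusing the table and row names from the shell examples:

// Hypothetical usage of HBaseExample; assumes a local HBase with
// ZooKeeper on port 2181, as configured in initializeConnection()
public class HBaseExampleDriver {
    public static void main(String[] args) throws Exception {
        HBaseExample example = new HBaseExample();
        example.initializeConnection();
        example.createTable("users", new String[]{"personal", "activity"});
        example.putData("users", "user123", "personal", "name", "John Doe");
        example.getData("users", "user123");
        example.scanTable("users", "user100", "user200");
        example.close();
    }
}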
# Python HBase Client (using happybase, which talks to the HBase Thrift server)
import happybase

class HBaseClient:
    def __init__(self, host='localhost', port=9090):
        # 9090 is the default HBase Thrift server port
        self.connection = happybase.Connection(host, port)

    def create_table(self, table_name, column_families):
        """Create HBase table with column families"""
        families = {}
        for cf_name, cf_config in column_families.items():
            options = {
                'max_versions': cf_config.get('versions', 1),
                'compression': 'SNAPPY',
            }
            # Only set a TTL when one is requested; passing None would
            # override the server-side default
            ttl = cf_config.get('ttl')
            if ttl is not None:
                options['time_to_live'] = ttl
            families[cf_name] = options
        self.connection.create_table(table_name, families)
        print(f"Table '{table_name}' created successfully")

    def put_data(self, table_name, row_key, data):
        """Insert data into HBase table"""
        table = self.connection.table(table_name)
        table.put(row_key.encode(), data)

    def get_data(self, table_name, row_key, columns=None):
        """Retrieve a row from HBase, grouped by column family"""
        table = self.connection.table(table_name)
        row = table.row(row_key.encode(), columns=columns)
        result = {}
        for key, value in row.items():
            cf_qualifier = key.decode().split(':', 1)
            if len(cf_qualifier) == 2:
                cf, qualifier = cf_qualifier
                result.setdefault(cf, {})[qualifier] = value.decode()
        return result

    def scan_table(self, table_name, row_start=None, row_stop=None, columns=None):
        """Scan HBase table with optional row range and column filters"""
        table = self.connection.table(table_name)
        for key, data in table.scan(row_start=row_start, row_stop=row_stop,
                                    columns=columns):
            yield key.decode(), {k.decode(): v.decode() for k, v in data.items()}

    def close(self):
        self.connection.close()
# Usage Example
hbase = HBaseClient()

# Create table
column_families = {
    'personal': {'versions': 3, 'ttl': None},
    'activity': {'versions': 1, 'ttl': 86400}  # 1 day TTL
}
hbase.create_table('users', column_families)

# Insert data (happybase expects bytes for column names and values)
user_data = {
    b'personal:name': b'Alice Johnson',
    b'personal:email': b'alice@example.com',
    b'personal:age': b'28',
    b'activity:last_login': b'2024-01-15T10:30:00',
    b'activity:page_views': b'1250'
}
hbase.put_data('users', 'user456', user_data)

# Retrieve data
user = hbase.get_data('users', 'user456')
print(f"User data: {user}")

# Scan only the 'personal' column family
for row_key, data in hbase.scan_table('users', columns=[b'personal']):
    print(f"Row: {row_key}, Data: {data}")

hbase.close()