Requirements:
1. The table order_sku_manage in a CockroachDB database holds several billion rows that need to be exported to CSV files so they can be migrated to Hadoop.
2. The export must be split across multiple files, with at most 500 million rows per file.
Notes:
Modules used: os, csv, psycopg2. CockroachDB speaks the PostgreSQL wire protocol, which is why the standard PostgreSQL driver psycopg2 can connect to it directly.
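A single SELECT over billions of rows would exhaust memory on both client and server, so the script below pages through the table by primary key (keyset pagination): each query fetches the next batch of rows whose key is greater than the largest key seen so far. As a minimal sketch of that pattern, reusing a psycopg2 cursor like the one in the full script and a hypothetical table t with a string primary key id:

last_id = ""  # sentinel that sorts before every real key
while True:
    # parameterized query; psycopg2 substitutes %s safely
    cursor.execute("select * from t where id > %s order by id limit 10000",
                   (last_id,))
    rows = cursor.fetchall()
    if not rows:           # empty batch: the whole table has been read
        break
    last_id = rows[-1][0]  # resume after the largest key in this batch
    # ... write the batch somewhere ...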
# coding:utf-8
import os
import csv
import psycopg2

# CockroachDB connection parameters
class CockDBConfig(object):
    host = "192.168.1.100"
    port = 3359
    database = "test"
    user = "cock_ro"
    password = "abcd1234"

conn = psycopg2.connect(
    port=CockDBConfig.port,
    host=CockDBConfig.host,
    database=CockDBConfig.database,
    user=CockDBConfig.user,
    password=CockDBConfig.password
)
conn.set_session(autocommit=True)
cursor = conn.cursor()

# Output directory: the "result" folder next to this .py file
file_path = "{0}{1}result{1}".format(os.path.dirname(os.path.abspath(__file__)), os.sep)
os.makedirs(file_path, exist_ok=True)  # create it if it does not exist yet

# Table name, primary-key column, a sentinel id smaller than any real key,
# and how many rows each query returns
table_name = "order_sku_manage"
key_column = "order_sku"
min_id = "0000000000000000000"
limit_num = 10000

# Maximum number of rows stored in each output file
batch_num = 500000000
file_num = 1

# Keyset-pagination query: fetch the next limit_num rows whose key is
# greater than the largest key seen so far
sql = "select * from {0} where {1} > '{2}' order by {1} limit {3}"
sel_times = 1
while True:
    exec_sql = sql.format(table_name, key_column, min_id, limit_num)
    cursor.execute(exec_sql)
    rows = cursor.fetchall()
    # An empty result set means the whole table has been read
    if not rows:
        break
    else:
        # Remember the largest key in this batch as the starting point of the
        # next query (assumes the key column is the first column of select *)
        min_id = rows[-1][0]
        # Once the current file holds batch_num rows, roll over to a new file
        if sel_times * limit_num > batch_num:
            sel_times = 1
            file_num += 1
        data_file = "{}{}_data_{:0>2}.csv".format(file_path, table_name, file_num)
        with open(data_file, "a+", newline="", encoding="utf-8") as csv_file:
            w = csv.writer(csv_file)
            w.writerows(rows)
        sel_times += 1
cursor.close()
conn.close()
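Before the migration it is worth checking that the number of exported lines matches select count(*) from order_sku_manage. A small sanity-check sketch, assuming the files sit in the result directory next to the script:

import csv
import glob

total = 0
for csv_path in sorted(glob.glob("result/order_sku_manage_data_*.csv")):
    with open(csv_path, newline="", encoding="utf-8") as f:
        n = sum(1 for _ in csv.reader(f))  # count data rows (the export writes no header)
    print(csv_path, n)
    total += n
print("total rows exported:", total)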
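The actual move to Hadoop is outside the export script. One common route is copying the files into HDFS with the hdfs dfs CLI; a sketch under the assumption of a target directory /data/order_sku_manage (adjust to your cluster layout):

import glob
import subprocess

# Hypothetical HDFS target directory
hdfs_dir = "/data/order_sku_manage"
subprocess.run(["hdfs", "dfs", "-mkdir", "-p", hdfs_dir], check=True)
for csv_path in sorted(glob.glob("result/order_sku_manage_data_*.csv")):
    # hdfs dfs -put copies a local file into HDFS; -f overwrites on retry
    subprocess.run(["hdfs", "dfs", "-put", "-f", csv_path, hdfs_dir], check=True)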
Original post: https://www.cnblogs.com/broadway/p/11597414.html