首页 > 编程语言 > 详细

大文件中数据排序问题

时间:2021-09-17 15:53:00      阅读:25      评论:0      收藏:0      [点我收藏+]

问题:有一个大文件,包含很多行,每一行都是一个 int 类型的数据,要求将这些数据按照从小到大的顺序进行排序。

package com.example.test;


import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Random;

/**
 * External merge sort for a large text file that holds one {@code int} per line.
 *
 * <p>The source file is dealt out line by line into a fixed number of temporary files, each
 * temporary file is loaded, sorted and rewritten, and the sorted temporary files are finally
 * combined with a k-way merge into one ascending output file. Every file written by this class
 * uses {@code "\r\n"} as its line terminator.
 */
public class BigFileSort {
    /**
     * Directory that holds the large source file and the sorted result.
     */
    private static final String SOURCE_FILE_PATH = "/Users/enjoy/Documents/test";
    /**
     * Directory that holds the intermediate temporary files.
     */
    private static final String TEMP_FILE_PATH = "/Users/enjoy/Documents/test/temp";
    /**
     * Base name of the large source file.
     */
    private static final String SOURCE_FILE_NAME = "data";
    /**
     * Base name of the generated, fully sorted file.
     */
    private static final String SORTED_FILE_NAME = "sorted";
    /**
     * Prefix of every temporary file name; the 1-based file index follows it.
     */
    private static final String TEMP_FILE_NAME_PREFIX = "temp-";
    /**
     * Extension shared by all files.
     */
    private static final String FILE_SUFFIX = ".txt";
    /**
     * Number of lines in the generated source file; also the exclusive upper bound of the
     * generated values.
     */
    private static final int SOURCE_DATA_COUNT = 1000000;
    /**
     * Number of temporary files the source file is split into.
     */
    private static final int TEMP_FILE_COUNT = 10;
    /**
     * Line terminator written after every number.
     */
    private static final String LINE_SEPARATOR = "\r\n";

    /**
     * Generates the test data, then splits, sorts and merges it, printing progress and the
     * elapsed time.
     *
     * @param args ignored
     * @throws IOException if generating, splitting or merging fails
     */
    public static void main(String[] args) throws IOException {

        long startNumber = System.currentTimeMillis();
        String sourceFileName = sourcePath(SOURCE_FILE_PATH);
        // Generate the test data.
        mockBigDataFile(sourceFileName, SOURCE_DATA_COUNT);
        System.out.println("存储完毕");
        // Distribute the large file over several small files.
        splitBigFile(SOURCE_FILE_PATH, TEMP_FILE_PATH, TEMP_FILE_COUNT);
        System.out.println("文件切割完毕!");
        // Sort the content of every small file.
        sortTempFile(TEMP_FILE_PATH, TEMP_FILE_COUNT);
        System.out.println("每个子文件排序完毕!");
        // Merge the sorted small files into the final result.
        mergeTempSortedFile(SOURCE_FILE_PATH, TEMP_FILE_PATH, TEMP_FILE_COUNT);
        System.out.println("整合完毕");
        long stopNumber = System.currentTimeMillis();
        System.out.println("耗时" + (stopNumber - startNumber) + "毫秒");
    }

    /**
     * Writes {@code count} random non-negative integers, one per line, to {@code fileName},
     * replacing any previous content. Missing parent directories of {@code fileName} are created.
     *
     * @param fileName path of the file to (over)write
     * @param count    number of lines to generate; nothing is written when it is not positive
     * @throws IOException if the directory or file cannot be created or written
     */
    public static void mockBigDataFile(String fileName, int count) throws IOException {
        // Create the directory the target actually lives in, not a fixed one.
        File parent = new File(fileName).getAbsoluteFile().getParentFile();
        if (parent != null) {
            makeSureFileExists(parent.getPath(), fileName);
        }
        // One generator for the whole file instead of one per line.
        Random random = new Random();
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName))) {
            for (int i = 0; i < count; i++) {
                writer.write(random.nextInt(SOURCE_DATA_COUNT) + LINE_SEPARATOR);
            }
        }
    }

    /**
     * Makes sure the directory {@code filePath} and the file {@code fileName} both exist.
     *
     * @param filePath directory to create, including missing ancestors
     * @param fileName full path of the file to create when it is absent
     * @throws IOException if the directory or the file cannot be created
     */
    public static void makeSureFileExists(String filePath, String fileName) throws IOException {
        File directory = new File(filePath);
        // mkdirs() also returns false when someone else created the directory in the meantime,
        // so the directory is re-checked before giving up.
        if (!directory.isDirectory() && !directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Cannot create directory: " + directory);
        }
        File file = new File(fileName);
        if (!file.exists()) {
            file.createNewFile();
        }
    }

    /**
     * Deals the lines of the source file round-robin into {@code fileCount} temporary files:
     * line 1 goes to file 1, line 2 to file 2, and so on, wrapping around after the last file.
     * Existing temporary files are overwritten.
     *
     * @param sourceFilePath directory that contains the source file
     * @param tempFilePath   directory that receives the temporary files; created when missing
     * @param fileCount      number of temporary files, must be positive
     * @throws IllegalArgumentException if {@code fileCount} is not positive
     * @throws IOException              if the source cannot be read or a temporary file cannot
     *                                  be written
     */
    public static void splitBigFile(String sourceFilePath, String tempFilePath,
                                    int fileCount) throws IOException {
        if (fileCount <= 0) {
            // Without a target file no line would ever be consumed.
            throw new IllegalArgumentException("fileCount must be positive: " + fileCount);
        }
        try (BufferedReader source = new BufferedReader(new FileReader(sourcePath(sourceFilePath)));
             ResourceGroup<BufferedWriter> outputs = new ResourceGroup<>()) {
            for (int j = 1; j <= fileCount; j++) {
                String fileName = tempPath(tempFilePath, j);
                makeSureFileExists(tempFilePath, fileName);
                outputs.register(new BufferedWriter(new FileWriter(fileName, false)));
            }
            int target = 0;
            String line;
            // readLine() returning null is the end-of-file signal; ready() only reports whether
            // a read would block.
            while ((line = source.readLine()) != null) {
                outputs.get(target).write(line + LINE_SEPARATOR);
                target = (target + 1) % fileCount;
            }
        }
    }

    /**
     * Sorts the numbers of every temporary file in ascending order and rewrites the file in
     * place. Blank lines are dropped. A file that cannot be read or parsed is reported on
     * standard error and skipped, so the remaining files are still processed.
     *
     * @param filePath  directory that contains the temporary files
     * @param fileCount number of temporary files, numbered from 1
     */
    public static void sortTempFile(String filePath, int fileCount) {
        for (int i = 1; i <= fileCount; i++) {
            String path = tempPath(filePath, i);
            try {
                // The reader is closed before the same file is reopened for writing.
                List<Integer> numbers = readNumbers(path);
                Collections.sort(numbers);
                sortedToFile(numbers, path);
            } catch (NumberFormatException | IOException e) {
                System.err.println("Could not sort temporary file " + path);
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes the given numbers to {@code path}, one per line, replacing any previous content.
     * A write failure is reported on standard error instead of being thrown.
     *
     * @param list numbers to write, in iteration order
     * @param path file to overwrite
     */
    public static void sortedToFile(List<Integer> list, String path) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
            for (Integer number : list) {
                writer.write(number + LINE_SEPARATOR);
            }
        } catch (IOException e) {
            System.err.println("Could not write " + path);
            e.printStackTrace();
        }
    }

    /**
     * Merges the sorted temporary files into one ascending file named after
     * {@code SORTED_FILE_NAME} inside {@code filepath}. Each temporary file must already be
     * sorted in ascending order; blank lines are ignored.
     *
     * @param filepath      directory that receives the merged file
     * @param splitFilePath directory that contains the temporary files
     * @param fileCount     number of temporary files, numbered from 1
     * @throws IOException           if a temporary file cannot be read or the result cannot be
     *                               written
     * @throws NumberFormatException if a non-blank line is not a valid {@code int}
     */
    public static void mergeTempSortedFile(String filepath, String splitFilePath, int fileCount) throws IOException {
        Comparator<RunCursor> byHead = Comparator.comparingInt(cursor -> cursor.head);
        try (BufferedWriter merged = new BufferedWriter(new FileWriter(sortedPath(filepath), false));
             ResourceGroup<BufferedReader> inputs = new ResourceGroup<>()) {
            // The heap keeps the cursor with the smallest pending number on top, which replaces
            // re-sorting the whole list after every line.
            PriorityQueue<RunCursor> pending = new PriorityQueue<>(Math.max(1, fileCount), byHead);
            for (int j = 1; j <= fileCount; j++) {
                BufferedReader reader =
                        inputs.register(new BufferedReader(new FileReader(tempPath(splitFilePath, j))));
                RunCursor cursor = new RunCursor(reader);
                if (cursor.advance()) {
                    pending.add(cursor);
                }
            }
            while (!pending.isEmpty()) {
                RunCursor smallest = pending.poll();
                merged.write(smallest.head + LINE_SEPARATOR);
                // Re-insert the cursor only while its file still has numbers left.
                if (smallest.advance()) {
                    pending.add(smallest);
                }
            }
        }
    }

    /** Builds the path of the source file inside {@code directory}. */
    private static String sourcePath(String directory) {
        return directory + "/" + SOURCE_FILE_NAME + FILE_SUFFIX;
    }

    /** Builds the path of the sorted result file inside {@code directory}. */
    private static String sortedPath(String directory) {
        return directory + "/" + SORTED_FILE_NAME + FILE_SUFFIX;
    }

    /** Builds the path of the temporary file with the given 1-based index. */
    private static String tempPath(String directory, int index) {
        return directory + "/" + TEMP_FILE_NAME_PREFIX + index + FILE_SUFFIX;
    }

    /** Reads every number of the file at {@code path} into a list, skipping blank lines. */
    private static List<Integer> readNumbers(String path) throws IOException {
        List<Integer> numbers = new ArrayList<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            for (Integer number = readNumber(reader); number != null; number = readNumber(reader)) {
                numbers.add(number);
            }
        }
        return numbers;
    }

    /**
     * Returns the next number from {@code reader}, skipping blank lines, or {@code null} once the
     * end of the input is reached.
     */
    private static Integer readNumber(BufferedReader reader) throws IOException {
        String line;
        while ((line = reader.readLine()) != null) {
            String text = line.trim();
            if (!text.isEmpty()) {
                return Integer.valueOf(text);
            }
        }
        return null;
    }

    /** Read position inside one sorted temporary file during the merge. */
    private static final class RunCursor {
        private final BufferedReader reader;
        /** Most recently read number; only meaningful after {@link #advance()} returned true. */
        private int head;

        RunCursor(BufferedReader reader) {
            this.reader = reader;
        }

        /** Loads the next number into {@code head}; returns false when the file is exhausted. */
        boolean advance() throws IOException {
            Integer next = readNumber(reader);
            if (next == null) {
                return false;
            }
            head = next;
            return true;
        }
    }

    /** Lets a variable number of streams be closed by a single try-with-resources statement. */
    private static final class ResourceGroup<T extends Closeable> implements Closeable {
        private final List<T> members = new ArrayList<>();

        /** Adds {@code resource} to the group and returns it. */
        T register(T resource) {
            members.add(resource);
            return resource;
        }

        /** Returns the resource registered at the given 0-based position. */
        T get(int index) {
            return members.get(index);
        }

        /** Closes every member even if some fail; the first failure is thrown, later ones suppressed. */
        @Override
        public void close() throws IOException {
            IOException failure = null;
            for (T member : members) {
                try {
                    member.close();
                } catch (IOException e) {
                    if (failure == null) {
                        failure = e;
                    } else {
                        failure.addSuppressed(e);
                    }
                }
            }
            if (failure != null) {
                throw failure;
            }
        }
    }
}

/**
 * Mutable pair of an integer value and the {@link BufferedReader} it is associated with.
 */
class ReadNode {
    /** Current value; replaced through {@link #setValue(Integer)}. */
    Integer value;
    /** Reader associated with this node. */
    BufferedReader br;

    /**
     * Creates a node.
     *
     * @param value initial value
     * @param br    reader to associate with the value
     */
    public ReadNode(Integer value, BufferedReader br) {
        this.value = value;
        this.br = br;
    }

    /**
     * Returns the current value, so callers do not have to read the field directly.
     *
     * @return the value last passed to the constructor or to {@link #setValue(Integer)}
     */
    public Integer getValue() {
        return value;
    }

    /**
     * Replaces the current value.
     *
     * @param value new value
     */
    public void setValue(Integer value) {
        this.value = value;
    }

    /**
     * Returns the reader given to the constructor.
     *
     * @return the associated reader
     */
    public BufferedReader getBr() {
        return br;
    }

}

大文件中数据排序问题

原文:https://www.cnblogs.com/onlyrun/p/15302963.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!