読者です 読者をやめる 読者になる 読者になる

BufferedReader#readLineを使わずに、ファイルを一行ずつ読み込む

Javaで、ファイルから一行ずつ読み込みを行いたい場合は、BufferedReader#readLine()を使うんだけど、このやり方だと改行コードが取り除かれてしまう。で、自前で改行コードを付与して処理を行うなり、ファイルに書きだすなりすることとなる。

でも、場合によっては改行コードをそのまま保持したいときもある。そこで、Reader#readを使ったやり方を考えてみた。

/**
 * Copyright (c) 2012 Jupitris on Labs.
 * 
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
package com.atphis.jupitris.file;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * @author jupitris
 * 
 */
public class FileLineSeparator {

    /** Carriage return (0x0D, decimal 13). */
    private static final char CR = '\r';

    /** Line feed (0x0A, decimal 10). */
    private static final char LF = '\n';

    /**
     * Not instantiable: this class only hosts static helpers.
     */
    private FileLineSeparator() {
        throw new AssertionError();
    }

    /**
     * Splits the text supplied by {@code reader} into lines, keeping each
     * line's terminator attached to the line it ends.
     * <p>
     * Recognised terminators are {@code "\r\n"}, a lone {@code "\r"} and a
     * lone {@code "\n"}; they may be mixed freely within one input. An empty
     * line is therefore returned as a string holding only its terminator.
     * Text after the last terminator (a final line without a line break) is
     * returned as the last element, so concatenating the returned strings
     * reproduces the input exactly.
     * <p>
     * The reader is consumed until end of input but is not closed; closing
     * it is left to the caller. Characters are read one at a time, so the
     * caller should pass a buffered reader.
     * <p>
     * If reading fails, the stack trace is printed and the lines emitted
     * before the failure are returned; the line still being assembled at
     * that moment is discarded.
     *
     * @param reader
     *            the character source to split; must not be {@code null}.
     * @return the lines in input order, each including its terminator; an
     *         empty list when the input is empty. Never {@code null}.
     */
    public static List<String> readCharacters(Reader reader) {

        List<String> lines = new ArrayList<String>();
        StringBuilder line = new StringBuilder();
        // True while the last buffered character is a CR whose line is kept
        // open because an immediately following LF belongs to the same
        // terminator.
        boolean pendingCr = false;

        try {
            for (int c = reader.read(); c != -1; c = reader.read()) {
                if (pendingCr) {
                    pendingCr = false;
                    if (c == LF) {
                        // CR LF: the LF completes the two-character terminator.
                        line.append(LF);
                        emit(lines, line);
                        continue;
                    }
                    // Lone CR: the previous line is complete and the current
                    // character already belongs to the next one.
                    emit(lines, line);
                }
                line.append((char) c);
                if (c == LF) {
                    emit(lines, line);
                } else if (c == CR) {
                    pendingCr = true;
                }
            }
            // Flush a trailing lone CR or a final line without a terminator.
            if (line.length() > 0) {
                emit(lines, line);
            }
        } catch (IOException e) {
            // Best effort by design: report the failure and return what was
            // split so far instead of propagating the exception.
            e.printStackTrace();
        }
        return lines;
    }

    /**
     * Appends the buffered line to {@code lines} and clears the buffer so it
     * can collect the next line.
     *
     * @param lines
     *            the result list to append to.
     * @param line
     *            the buffer holding one complete line.
     */
    private static void emit(List<String> lines, StringBuilder line) {
        lines.add(line.toString());
        line.setLength(0);
    }
}

あまりコードが綺麗でなくて申し訳ないところなのだが、要は1文字ずつファイルから読み出し、キャリッジリターン(CR)やラインフィード(LF)が出現したらひとつ先の文字を読みだして、それが通常の文字なのかCRやLFなのかを判定して1行分をListに保存している。

テストコードを用意したので、興味の有る方は実際に実行していただければ。
準備するファイルのパターンは、改行コードがそれぞれCR+LF、CR、LFのものと、何も書かれていない空のファイル。

/**
 * Copyright (c) 2012 Jupitris on Labs.
 * 
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
package com.atphis.jupitris.file;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.List;

import org.junit.Assert;
import org.junit.Test;

/**
 * @author jupitris
 * 
 */
public class FileLineSeparatorTest {

    /** Charset name used to decode every fixture file. */
    private static final String ENCODING = "EUC-JP";

    /** Fixture terminated with CR+LF: every line must keep its "\r\n". */
    @Test
    public void testReadCharactersFromWin() throws FileNotFoundException {
        assertFixtureLines("data/testdata-dos.txt", "\r\n");
    }

    /** Fixture terminated with CR only: every line must keep its "\r". */
    @Test
    public void testReadCharactersFromMac() throws FileNotFoundException {
        assertFixtureLines("data/testdata-mac.txt", "\r");
    }

    /** Fixture terminated with LF only: every line must keep its "\n". */
    @Test
    public void testReadCharactersFromUnix() throws FileNotFoundException {
        assertFixtureLines("data/testdata-unix.txt", "\n");
    }

    /** Empty fixture (CR+LF variant): the result must be an empty list. */
    @Test
    public void testReadCharactersFromEmptyWin() throws FileNotFoundException {
        assertEmptyFixture("data/testdata-empty-dos.txt");
    }

    /** Empty fixture (CR variant): the result must be an empty list. */
    @Test
    public void testReadCharactersFromEmptyMac() throws FileNotFoundException {
        assertEmptyFixture("data/testdata-empty-mac.txt");
    }

    /** Empty fixture (LF variant): the result must be an empty list. */
    @Test
    public void testReadCharactersFromEmptyUnix() throws FileNotFoundException {
        assertEmptyFixture("data/testdata-empty-unix.txt");
    }

    /**
     * Asserts that the fixture splits into the eight expected lines, each
     * ending with the given terminator.
     *
     * @param resourceName
     *            classpath location of the fixture.
     * @param eol
     *            the line terminator used throughout the fixture.
     */
    private static void assertFixtureLines(String resourceName, String eol) {
        List<String> actual = readResource(resourceName);

        Assert.assertNotNull(actual);
        Assert.assertFalse(actual.isEmpty());

        Assert.assertEquals(8, actual.size());

        Assert.assertEquals("新しい朝がきた。" + eol, actual.get(0));
        Assert.assertEquals(eol, actual.get(1));
        Assert.assertEquals("希望の朝だ。" + eol, actual.get(2));
        Assert.assertEquals(eol, actual.get(3));
        Assert.assertEquals(eol, actual.get(4));
        Assert.assertEquals("Hello! World!" + eol, actual.get(5));
        Assert.assertEquals("こんにちは!世界!" + eol, actual.get(6));
        Assert.assertEquals(eol, actual.get(7));
    }

    /**
     * Asserts that the fixture yields a non-null, empty list.
     *
     * @param resourceName
     *            classpath location of the fixture.
     */
    private static void assertEmptyFixture(String resourceName) {
        List<String> actual = readResource(resourceName);

        Assert.assertNotNull(actual);
        Assert.assertTrue(actual.isEmpty());
    }

    /**
     * Loads a classpath fixture, splits it with
     * {@link FileLineSeparator#readCharacters(Reader)} and closes the
     * underlying stream afterwards.
     *
     * @param resourceName
     *            classpath location of the fixture.
     * @return the lines produced by {@code readCharacters}.
     */
    private static List<String> readResource(String resourceName) {
        InputStream stream = ClassLoader.getSystemResourceAsStream(resourceName);
        // Fail with a readable message instead of a NullPointerException
        // raised from inside the InputStreamReader constructor.
        Assert.assertNotNull("test resource not found: " + resourceName, stream);

        Reader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(stream, ENCODING));
            return FileLineSeparator.readCharacters(reader);
        } catch (UnsupportedEncodingException e) {
            throw new AssertionError("fixture encoding is not supported: " + ENCODING);
        } finally {
            try {
                // Closing the reader also closes the stream; fall back to the
                // stream when the reader could not be constructed.
                if (reader != null) {
                    reader.close();
                } else {
                    stream.close();
                }
            } catch (IOException ignored) {
                // A failure while closing a read-only fixture cannot change
                // the outcome of the assertions.
            }
        }
    }
}

テストで使用している入力ファイルの内容もあわせて載せておくのでご参考までに。なお、「こんにちは!世界!」のあとに空行が1行入っている。

新しい朝がきた。

希望の朝だ。


Hello! World!
こんにちは!世界!

おそらくこの実装で問題はないと思うのだが*1、もっとスマートな方法や不具合を見つけた方は教えてください。

*1:ファイルのパターンが足りていないかも