package org.apache.tika.parser.txt;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.Assert;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/apache/tika/parser/txt/TXTParserTest.class */
/**
 * Tests for {@link TXTParser}: charset detection reported through the
 * {@code Content-Type} / {@code Content-Encoding} metadata, handling of
 * byte-order marks, and preservation of caller-supplied metadata hints.
 */
public class TXTParserTest extends TikaTest {
    /** Metadata key under which the parser reports the media type and charset. */
    private static final String CONTENT_TYPE = "Content-Type";

    /** Metadata key under which the parser reports the bare charset name. */
    private static final String CONTENT_ENCODING = "Content-Encoding";

    /** Metadata key that these tests expect the parser to leave unset. */
    private static final String CONTENT_LANGUAGE = "Content-Language";

    /** Parser under test; a single instance is reused by every test method. */
    private final Parser parser = new TXTParser();

    /**
     * Parses a plain ASCII English sentence (encoded as ISO-8859-1) and checks
     * that the charset is reported as ISO-8859-1, that no language metadata is
     * set, and that the text is passed through to the handler.
     */
    @Test
    public void testEnglishText() throws Exception {
        String text = "Hello, World! This is simple UTF-8 text content written in English to test autodetection of both the character encoding and the language of the input stream.";

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.ISO_8859_1)),
                new WriteOutContentHandler(writer),
                metadata,
                new ParseContext());
        String content = writer.toString();

        Assert.assertEquals("text/plain; charset=ISO-8859-1", metadata.get(CONTENT_TYPE));

        // The parser is expected not to populate any language metadata.
        Assert.assertNull(metadata.get(CONTENT_LANGUAGE));
        Assert.assertNull(metadata.get(TikaCoreProperties.LANGUAGE));

        assertContains("Hello", content);
        assertContains("World", content);
        assertContains("autodetection", content);
        assertContains("stream", content);
    }

    /**
     * Parses UTF-8 encoded text containing non-ASCII Latin letters and checks
     * that UTF-8 is detected and the characters survive extraction.
     */
    @Test
    public void testUTF8Text() throws Exception {
        // Unicode escapes keep the literal independent of the source-file encoding.
        String text = "I\u00F1t\u00EBrn\u00E2ti\u00F4n\u00E0liz\u00E6ti\u00F8n";

        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)),
                handler,
                metadata,
                new ParseContext());

        Assert.assertEquals("text/plain; charset=UTF-8", metadata.get(CONTENT_TYPE));
        Assert.assertEquals("UTF-8", metadata.get(CONTENT_ENCODING));
        assertContains(text, handler.toString());
    }

    /**
     * Parses an empty stream and checks that the charset is reported as UTF-8
     * and that the extracted body is a single newline.
     */
    @Test
    public void testEmptyText() throws Exception {
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(new byte[0]),
                handler,
                metadata,
                new ParseContext());

        Assert.assertEquals("text/plain; charset=UTF-8", metadata.get(CONTENT_TYPE));
        Assert.assertEquals("\n", handler.toString());
    }

    /**
     * Checks which eight-bit charset is assigned to mostly-ASCII input:
     * CRLF line endings yield windows-1252, a bare LF yields ISO-8859-1, and
     * text containing the euro sign (encoded as ISO-8859-15) yields ISO-8859-15.
     */
    @Test
    public void testLatinDetectionHeuristics() throws Exception {
        String windows = "test\r\n";
        String unix = "test\n";
        String euro = "test \u20AC\n"; // U+20AC EURO SIGN

        Metadata windowsMetadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(windows.getBytes("ISO-8859-15")),
                new DefaultHandler(),
                windowsMetadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=windows-1252", windowsMetadata.get(CONTENT_TYPE));

        Metadata unixMetadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(unix.getBytes("ISO-8859-15")),
                new DefaultHandler(),
                unixMetadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-1", unixMetadata.get(CONTENT_TYPE));

        Metadata euroMetadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(euro.getBytes("ISO-8859-15")),
                new DefaultHandler(),
                euroMetadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-15", euroMetadata.get(CONTENT_TYPE));
    }

    /**
     * Checks that a leading byte-order mark (UTF-8, UTF-16 BE, UTF-16 LE) is
     * not part of the extracted text.
     */
    @Test
    public void testDropByteOrderMark() throws Exception {
        assertExtractText("UTF-8 BOM", "test", new byte[] {
                (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 't', 'e', 's', 't'});
        assertExtractText("UTF-16 BE BOM", "test", new byte[] {
                (byte) 0xFE, (byte) 0xFF, 0, 't', 0, 'e', 0, 's', 0, 't'});
        assertExtractText("UTF-16 LE BOM", "test", new byte[] {
                (byte) 0xFF, (byte) 0xFE, 't', 0, 'e', 0, 's', 0, 't', 0});
    }

    /**
     * Checks that a charset already present in the incoming
     * {@code Content-Type} metadata is honoured: without a hint the input is
     * reported as ISO-8859-1, with an ISO-8859-15 hint it is reported as
     * ISO-8859-15.
     */
    @Test
    public void testUseIncomingCharsetAsHint() throws Exception {
        // U+00E1 is LATIN SMALL LETTER A WITH ACUTE.
        String text = "the name is \u00E1ndre";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.ISO_8859_1)),
                new BodyContentHandler(),
                metadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-1", metadata.get(CONTENT_TYPE));
        Assert.assertEquals("ISO-8859-1", metadata.get(CONTENT_ENCODING));

        // Second pass deliberately reuses the same Metadata object, now carrying a hint.
        metadata.set(CONTENT_TYPE, "text/plain; charset=ISO-8859-15");
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.ISO_8859_1)),
                new BodyContentHandler(),
                metadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-15", metadata.get(CONTENT_TYPE));
        Assert.assertEquals("ISO-8859-15", metadata.get(CONTENT_ENCODING));
    }

    /**
     * Checks that a charset supplied in a {@code text/html} content-type
     * header on fresh metadata is kept, together with the media type itself.
     */
    @Test
    public void testUsingCharsetInContentTypeHeader() throws Exception {
        // U+00E1 is LATIN SMALL LETTER A WITH ACUTE.
        String text = "the name is \u00E1ndre";

        Metadata detected = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.ISO_8859_1)),
                new BodyContentHandler(),
                detected,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-1", detected.get(CONTENT_TYPE));
        Assert.assertEquals("ISO-8859-1", detected.get(CONTENT_ENCODING));

        Metadata hinted = new Metadata();
        hinted.set(CONTENT_TYPE, "text/html; charset=ISO-8859-15");
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.ISO_8859_1)),
                new BodyContentHandler(),
                hinted,
                new ParseContext());
        Assert.assertEquals("text/html; charset=ISO-8859-15", hinted.get(CONTENT_TYPE));
        Assert.assertEquals("ISO-8859-15", hinted.get(CONTENT_ENCODING));
    }

    /**
     * Parses {@code input} and asserts that the extracted body text equals
     * {@code expected}.
     *
     * @param message  assertion message identifying the case being checked
     * @param expected exact text the body handler should contain
     * @param input    raw bytes fed to the parser
     * @throws Exception if parsing fails
     */
    private void assertExtractText(String message, String expected, byte[] input) throws Exception {
        BodyContentHandler handler = new BodyContentHandler() {
            @Override
            public void ignorableWhitespace(char[] ch, int start, int length) {
                // Intentionally empty: ignorable whitespace is dropped so the
                // exact-match comparison below sees only the document text.
            }
        };
        parser.parse(new ByteArrayInputStream(input), handler, new Metadata(), new ParseContext());
        Assert.assertEquals(message, expected, handler.toString());
    }

    /**
     * Checks that a language value already set on the metadata is still
     * present, unchanged, after parsing.
     */
    @Test
    public void testRetainIncomingLanguage() throws Exception {
        Metadata metadata = new Metadata();
        metadata.set(TikaCoreProperties.LANGUAGE, "en");

        parser.parse(
                new ByteArrayInputStream("Simple Content".getBytes(StandardCharsets.UTF_8)),
                new BodyContentHandler(),
                metadata,
                new ParseContext());

        Assert.assertEquals("en", metadata.get(TikaCoreProperties.LANGUAGE));
    }

    /**
     * Checks that the Russian CP866 test document is reported as IBM866.
     */
    @Test
    public void testCP866() throws Exception {
        Metadata metadata = new Metadata();
        // The stream is opened by this test, so it is closed here as well.
        try (InputStream input = getResourceAsStream("/test-documents/russian.cp866.txt")) {
            parser.parse(
                    input,
                    new WriteOutContentHandler(new StringWriter()),
                    metadata,
                    new ParseContext());
        }

        Assert.assertEquals("text/plain; charset=IBM866", metadata.get(CONTENT_TYPE));
    }

    /**
     * Checks that the English CP500 test document is reported as IBM500, and
     * that a short ASCII HTML snippet is still reported as ISO-8859-1 rather
     * than IBM500.
     */
    @Test
    public void testEBCDIC_CP500() throws Exception {
        Metadata ebcdicMetadata = new Metadata();
        // The stream is opened by this test, so it is closed here as well.
        try (InputStream input = getResourceAsStream("/test-documents/english.cp500.txt")) {
            parser.parse(
                    input,
                    new WriteOutContentHandler(new StringWriter()),
                    ebcdicMetadata,
                    new ParseContext());
        }
        Assert.assertEquals("text/plain; charset=IBM500", ebcdicMetadata.get(CONTENT_TYPE));

        // Guard against the detector being too eager on a short block of text.
        Metadata shortMetadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(
                        "<html><body>hello world</body></html>".getBytes(StandardCharsets.ISO_8859_1)),
                new WriteOutContentHandler(new StringWriter()),
                shortMetadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-1", shortMetadata.get(CONTENT_TYPE));
    }

    /**
     * Checks that a very short ASCII snippet is reported as ISO-8859-1, and
     * that an explicit UTF-8 charset on an incoming
     * {@code application/binary} content type is returned unchanged.
     */
    @Test
    public void testCharsetDetectionWithShortSnipet() throws Exception {
        String text = "Hello, World!";

        Metadata metadata = new Metadata();
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)),
                new BodyContentHandler(),
                metadata,
                new ParseContext());
        Assert.assertEquals("text/plain; charset=ISO-8859-1", metadata.get(CONTENT_TYPE));

        // Second pass deliberately reuses the same Metadata object, now carrying a hint.
        metadata.set(CONTENT_TYPE, "application/binary; charset=UTF-8");
        parser.parse(
                new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)),
                new BodyContentHandler(),
                metadata,
                new ParseContext());
        Assert.assertEquals("application/binary; charset=UTF-8", metadata.get(CONTENT_TYPE));
    }

    /**
     * Checks that the vCalendar test document keeps its
     * {@code text/x-vcalendar} media type, with the detected charset appended.
     */
    @Test
    public void testSubclassingMimeTypesRemain() throws Exception {
        Assert.assertEquals(
                "text/x-vcalendar; charset=ISO-8859-1",
                getXML("testVCalendar.vcs").metadata.get(CONTENT_TYPE));
    }
}
