package org.apache.james.mailbox.store.extractor;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.james.mailbox.extractor.ParsedContent;
import org.apache.james.mailbox.extractor.TextExtractor;
import org.apache.james.mailbox.model.ContentType;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

/* loaded from: input_file:org/apache/james/mailbox/store/extractor/JsoupTextExtractorTest.class */
/**
 * Tests for {@link JsoupTextExtractor}, exercised through the {@link TextExtractor} interface.
 *
 * <p>Covers HTML and plain-text inputs, charset parameters on the content type, null arguments,
 * and HTML input that contains NUL characters.
 */
class JsoupTextExtractorTest {
    private static final ContentType TEXT_HTML_CONTENT_TYPE = ContentType.of("text/html");

    // The fixture below must contain real NUL (U+0000) characters: the point of the NUL test is
    // that the input looks "binary" to the parser. Written with explicit escapes so the characters
    // survive editors and re-encoding.
    private static final String HTML_TEXT_CONTENT =
        "HTML pages can include a lot of null '\0' character. But still expecting the content can be parsed."
            + "Jsoup 1.12.1 thinks a file containing more than 10 null characters can be a binary file";
    // Ten trailing NULs; together with the one embedded above the input holds more than ten.
    private static final String NULL_CHARACTERS = "\0\0\0\0\0\0\0\0\0\0";
    private static final String FULL_CONTENT = HTML_TEXT_CONTENT + NULL_CHARACTERS;

    /** Extractor under test; a fresh instance is created before each test. */
    TextExtractor textExtractor;

    @BeforeEach
    void setUp() {
        textExtractor = new JsoupTextExtractor();
    }

    /** The extracted text of the sample HTML document must not include the title placeholder. */
    @Test
    void extractedTextFromHtmlShouldNotContainTheContentOfTitleTag() throws Exception {
        try (InputStream html = ClassLoader.getSystemResourceAsStream("documents/html.txt")) {
            // hasValueSatisfying reports an absent value as an assertion failure instead of
            // letting Optional.get() throw NoSuchElementException.
            Assertions.assertThat(textExtractor.extractContent(html, TEXT_HTML_CONTENT_TYPE).getTextualContent())
                .hasValueSatisfying(text -> Assertions.assertThat(text).doesNotContain("*|MC:SUBJECT|*"));
        }
    }

    /** A {@code text/plain} body is returned as the textual content. */
    @Test
    void extractContentShouldHandlePlainText() throws Exception {
        InputStream plainText = new ByteArrayInputStream("myText".getBytes(StandardCharsets.UTF_8));

        Assertions.assertThat(textExtractor.extractContent(plainText, ContentType.of("text/plain")).getTextualContent())
            .contains("myText");
    }

    /** A {@code text/plain} body with an explicit UTF-8 charset parameter is returned as the textual content. */
    @Test
    void extractContentShouldHandlePlainTextWithCharset() throws Exception {
        InputStream plainText = new ByteArrayInputStream("myText".getBytes(StandardCharsets.UTF_8));

        Assertions.assertThat(textExtractor.extractContent(plainText, ContentType.of("text/plain; charset=utf-8")).getTextualContent())
            .contains("myText");
    }

    /** For the {@code text/arbitrary} media type, no textual content is expected. */
    @Test
    void extractContentShouldHandleArbitraryTextMediaType() throws Exception {
        InputStream arbitraryText = new ByteArrayInputStream("myText".getBytes(StandardCharsets.UTF_8));

        Assertions.assertThat(textExtractor.extractContent(arbitraryText, ContentType.of("text/arbitrary")).getTextualContent())
            .isEmpty();
    }

    /** A null input stream yields {@link ParsedContent#empty()}. */
    @Test
    void extractContentShouldReturnEmptyWhenNullData() throws Exception {
        InputStream noData = null;

        Assertions.assertThat(textExtractor.extractContent(noData, TEXT_HTML_CONTENT_TYPE))
            .isEqualTo(ParsedContent.empty());
    }

    /** A null content type yields {@link ParsedContent#empty()}. */
    @Test
    void extractContentShouldReturnEmptyWhenNullContentType() throws Exception {
        ContentType noContentType = null;

        try (InputStream html = ClassLoader.getSystemResourceAsStream("documents/html.txt")) {
            Assertions.assertThat(textExtractor.extractContent(html, noContentType))
                .isEqualTo(ParsedContent.empty());
        }
    }

    /** HTML input carrying many NUL characters is still parsed and its text is returned. */
    @Test
    void extractContentShouldNotThrowWhenContainingNullCharacters() throws Exception {
        Assertions.assertThat(textExtractor.extractContent(textContentWithManyNullCharacters(), TEXT_HTML_CONTENT_TYPE).getTextualContent())
            .hasValueSatisfying(text -> Assertions.assertThat(text).contains(HTML_TEXT_CONTENT));
    }

    /** The charset parameter of a {@code text/plain} content type is used to decode the body. */
    @Test
    void extractContentShouldTakeIntoAccountCharsetWhenPlainText() throws Exception {
        try (InputStream latin1Text = ClassLoader.getSystemResourceAsStream("documents/simple-text-iso-8859-1.txt")) {
            // Fail with a clear message if the fixture is missing from the test classpath.
            Assertions.assertThat(latin1Text).isNotNull();

            Assertions.assertThat(textExtractor.extractContent(latin1Text, ContentType.of("text/plain; charset=ISO-8859-1")).getTextualContent())
                .contains("\"é\" This text is not UTF-8 \"à\"");
        }
    }

    /** The charset parameter of a {@code text/html} content type is used to decode the body. */
    @Test
    void extractContentTakeIntoAccountCharsetWhenHTML() throws Exception {
        try (InputStream latin1Html = ClassLoader.getSystemResourceAsStream("documents/html-iso-8859-1.html")) {
            // Fail with a clear message if the fixture is missing from the test classpath.
            Assertions.assertThat(latin1Html).isNotNull();

            Assertions.assertThat(textExtractor.extractContent(latin1Html, ContentType.of("text/html; charset=ISO-8859-1")).getTextualContent())
                .contains("\"é\" this is a simple HTML text \"à\"");
        }
    }

    /** Returns {@link #FULL_CONTENT} (text followed by a run of NULs) as a UTF-8 encoded stream. */
    private InputStream textContentWithManyNullCharacters() {
        return new ByteArrayInputStream(FULL_CONTENT.getBytes(StandardCharsets.UTF_8));
    }
}
