001/*
002 * Copyright 2023 the original author or authors.
003 * <p>
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 * <p>
008 * https://www.apache.org/licenses/LICENSE-2.0
009 * <p>
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package de.cuioss.tools.string;
017
018import static de.cuioss.tools.string.MoreStrings.isEmpty;
019import static de.cuioss.tools.string.MoreStrings.nullToEmpty;
020import static java.lang.Integer.valueOf;
021
022import java.io.Serializable;
023import java.util.ArrayList;
024import java.util.List;
025import java.util.Map;
026import java.util.Map.Entry;
027import java.util.regex.Pattern;
028
029import de.cuioss.tools.collect.MapBuilder;
030import lombok.EqualsAndHashCode;
031import lombok.Getter;
032import lombok.Setter;
033import lombok.ToString;
034
035/**
036 * This class provide functionality to transform long text to several html
037 * useful representation and encapsulate this as an object. It is implemented as
038 * an value-object keeping the calculated text.
039 *
040 * @author Eugen Fischer
041 */
042@EqualsAndHashCode(of = { "source", "abridgedLength", "forceLengthBreak" })
043@ToString(of = { "source", "abridgedLength", "forceLengthBreak" })
044public class TextSplitter implements Serializable {
045
046    /** serial Version UID */
047    private static final long serialVersionUID = 6594890288982910944L;
048
049    /**
050     * Der Browser muss über Sollbruchstellen die Möglichkeit bekommen lange
051     * Wortketten zu trennen/umzubrechen. Dafür gibt es zwei unsichtbare Zeichen,
052     * die in den HTML code eingebaut werden können: "&amp;shy;" und "&amp;#8203;".
053     * Der Unterschied zwischen beiden ist, dass das eine einfach ein Leerzeichen
054     * ohne breite ist, welches beim Umbruch keine Spuren hinterlässt, das andere
055     * fügt bei einem Umbruch einen Bindestrich hinzu. Eignet sich also zur
056     * Silbentrennung.
057     */
058    private static final String ZERO_WIDTH_SPACE = "\u200B";
059
060    private static final String TRADE_STR = "...";
061
062    private static final int DEFAULT_FORCE_LENGTH_BREAK = 15;
063
064    private static final int DEFAULT_ABRIDGED_LENGTH = 20;
065
066    private static final Map<Pattern, String> REPLACEMENT_MAP = new MapBuilder<Pattern, String>()
067            .put(Pattern.compile("#"), "#" + ZERO_WIDTH_SPACE).put(Pattern.compile("\\+"), "+" + ZERO_WIDTH_SPACE)
068            .put(Pattern.compile("-"), "-" + ZERO_WIDTH_SPACE).put(Pattern.compile("_"), "_" + ZERO_WIDTH_SPACE)
069            .put(Pattern.compile("\\."), "." + ZERO_WIDTH_SPACE).put(Pattern.compile("\\?"), "?" + ZERO_WIDTH_SPACE)
070            .put(Pattern.compile("!"), "!" + ZERO_WIDTH_SPACE).put(Pattern.compile(":"), ":" + ZERO_WIDTH_SPACE)
071            .put(Pattern.compile(","), "," + ZERO_WIDTH_SPACE).put(Pattern.compile(";"), ";" + ZERO_WIDTH_SPACE)
072            .toImmutableMap();
073
074    private final String source;
075
076    @Getter(lazy = true)
077    private final String abridgedText = initAbridged();
078
079    @Getter
080    private boolean abridged = false;
081
082    @Getter(lazy = true)
083    private final String textWithEnforcedLineBreaks = initTextWithLineBreaks();
084
085    @Setter
086    private Integer forceLengthBreak = null;
087
088    @Setter
089    private Integer abridgedLength = null;
090
091    /**
092     * Construct TextSplitter.
093     *
094     * @param longString source text which will be processed
095     */
096    public TextSplitter(final String longString) {
097        source = nullToEmpty(longString);
098    }
099
100    /**
101     * Alternative Constructor
102     *
103     * @param source                target text
104     * @param forceLengthBreakCount count of characters when a text break will
105     *                              forced
106     * @param abridgedLengthCount   count of characters
107     */
108    public TextSplitter(final String source, final int forceLengthBreakCount, final int abridgedLengthCount) {
109
110        this.source = source;
111        forceLengthBreak = valueOf(forceLengthBreakCount);
112        abridgedLength = valueOf(abridgedLengthCount);
113    }
114
115    private int getForceLengthBreak() {
116        if (null == forceLengthBreak) {
117            return DEFAULT_FORCE_LENGTH_BREAK;
118        }
119        return forceLengthBreak;
120    }
121
122    private int getAbridgedLength() {
123        if (null == abridgedLength) {
124            return DEFAULT_ABRIDGED_LENGTH;
125        }
126        return abridgedLength;
127    }
128
129    private String initAbridged() {
130        var result = "";
131
132        if (!isEmpty(source)) {
133
134            final var sourceSplitted = getSourceSplitted();
135
136            if (sourceSplitted.size() == 1) {
137                result = abridgeComputerProducedText();
138            } else {
139                result = abridgeHumanProducedText(sourceSplitted);
140            }
141        }
142
143        abridged = endsWith(result, TRADE_STR);
144
145        return result.trim();
146    }
147
148    private static boolean endsWith(final String str, final String suffix) {
149        return str.trim().endsWith(suffix);
150    }
151
152    /**
153     * @return abridged text
154     */
155    private String abridgeComputerProducedText() {
156        final var maxLength = getAbridgedLength() - (TRADE_STR.length() + 1);
157        if (source.length() > maxLength) {
158            return source.substring(0, maxLength) + " ...";
159        }
160        return source;
161    }
162
163    /**
164     * @param sourceSplitted
165     * @return abridged text
166     */
167    private String abridgeHumanProducedText(final List<String> sourceSplitted) {
168        final var maxLength = getAbridgedLength() - TRADE_STR.length();
169        final var builder = new StringBuilder();
170        var count = 0;
171        for (final String part : sourceSplitted) {
172            count = count + part.length();
173            if (count >= maxLength) {
174                builder.append(TRADE_STR);
175                break;
176            }
177
178            builder.append(part).append(" ");
179
180            count = count + 1;
181        }
182        return builder.toString();
183    }
184
185    private String initTextWithLineBreaks() {
186
187        var result = "";
188
189        if (!isEmpty(source)) {
190            final var sourceSplitted = getSourceSplitted();
191            if (sourceSplitted.size() == 1) {
192                result = forceLineBreakForComputerProducedText(source);
193            } else {
194                result = forceLineBreakForHumanProducedText(sourceSplitted);
195            }
196        }
197
198        return result.trim();
199    }
200
201    private String forceLineBreakForHumanProducedText(final List<String> sourceSplitted) {
202        final var builder = new StringBuilder();
203        for (final String text : sourceSplitted) {
204            builder.append(forceLineBreakForComputerProducedText(text)).append(" ");
205        }
206        return builder.toString();
207    }
208
209    /**
210     * Try to separate text target on native text breaks. If this is not enough use
211     * brute-force on max allowed length.
212     *
213     * @param text target which will be analyzed
214     * @return
215     */
216    private String forceLineBreakForComputerProducedText(final String text) {
217
218        // try to separate on native text breaks
219        var clean = text;
220        for (final Entry<Pattern, String> entry : REPLACEMENT_MAP.entrySet()) {
221            final var matcher = entry.getKey().matcher(clean);
222            clean = matcher.replaceAll(entry.getValue());
223        }
224
225        final var splittedByZeroWidthSpace = getSplittedByZeroWidthSpace(clean);
226        final List<String> lengthTrimed = new ArrayList<>();
227
228        for (final String item : splittedByZeroWidthSpace) {
229            lengthTrimed.add(bruteForceSplit(item));
230        }
231
232        return Joiner.on(ZERO_WIDTH_SPACE).join(lengthTrimed);
233    }
234
235    /**
236     * Verify if very long text still exists and execute brute-force dissipation
237     *
238     * @param text target
239     * @return fragmented text if length doesn't fit to force length break
240     */
241    private String bruteForceSplit(final String text) {
242        final var maxLength = getForceLengthBreak();
243        if (!isEmpty(text)) {
244            final var builder = new StringBuilder();
245            var tmp = text;
246            while (tmp.length() > maxLength) {
247                builder.append(tmp, 0, maxLength).append(ZERO_WIDTH_SPACE);
248                tmp = tmp.substring(maxLength);
249            }
250            if (!tmp.isEmpty()) {
251                builder.append(tmp);
252            }
253            return builder.toString();
254        }
255        return text;
256    }
257
258    private static List<String> getSplittedByZeroWidthSpace(final String value) {
259        return Splitter.on(ZERO_WIDTH_SPACE).splitToList(value);
260    }
261
262    private List<String> getSourceSplitted() {
263        return Splitter.on(" ").splitToList(source);
264    }
265}