001/* 002 * Copyright 2023 the original author or authors. 003 * <p> 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * <p> 008 * https://www.apache.org/licenses/LICENSE-2.0 009 * <p> 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package de.cuioss.tools.string; 017 018import static de.cuioss.tools.string.MoreStrings.isEmpty; 019import static de.cuioss.tools.string.MoreStrings.nullToEmpty; 020import static java.lang.Integer.valueOf; 021 022import java.io.Serializable; 023import java.util.ArrayList; 024import java.util.List; 025import java.util.Map; 026import java.util.Map.Entry; 027import java.util.regex.Pattern; 028 029import de.cuioss.tools.collect.MapBuilder; 030import lombok.EqualsAndHashCode; 031import lombok.Getter; 032import lombok.Setter; 033import lombok.ToString; 034 035/** 036 * This class provide functionality to transform long text to several html 037 * useful representation and encapsulate this as an object. It is implemented as 038 * an value-object keeping the calculated text. 039 * 040 * @author Eugen Fischer 041 */ 042@EqualsAndHashCode(of = { "source", "abridgedLength", "forceLengthBreak" }) 043@ToString(of = { "source", "abridgedLength", "forceLengthBreak" }) 044public class TextSplitter implements Serializable { 045 046 /** serial Version UID */ 047 private static final long serialVersionUID = 6594890288982910944L; 048 049 /** 050 * Der Browser muss über Sollbruchstellen die Möglichkeit bekommen lange 051 * Wortketten zu trennen/umzubrechen. Dafür gibt es zwei unsichtbare Zeichen, 052 * die in den HTML code eingebaut werden können: "&shy;" und "&#8203;". 053 * Der Unterschied zwischen beiden ist, dass das eine einfach ein Leerzeichen 054 * ohne breite ist, welches beim Umbruch keine Spuren hinterlässt, das andere 055 * fügt bei einem Umbruch einen Bindestrich hinzu. Eignet sich also zur 056 * Silbentrennung. 057 */ 058 private static final String ZERO_WIDTH_SPACE = "\u200B"; 059 060 private static final String TRADE_STR = "..."; 061 062 private static final int DEFAULT_FORCE_LENGTH_BREAK = 15; 063 064 private static final int DEFAULT_ABRIDGED_LENGTH = 20; 065 066 private static final Map<Pattern, String> REPLACEMENT_MAP = new MapBuilder<Pattern, String>() 067 .put(Pattern.compile("#"), "#" + ZERO_WIDTH_SPACE).put(Pattern.compile("\\+"), "+" + ZERO_WIDTH_SPACE) 068 .put(Pattern.compile("-"), "-" + ZERO_WIDTH_SPACE).put(Pattern.compile("_"), "_" + ZERO_WIDTH_SPACE) 069 .put(Pattern.compile("\\."), "." + ZERO_WIDTH_SPACE).put(Pattern.compile("\\?"), "?" + ZERO_WIDTH_SPACE) 070 .put(Pattern.compile("!"), "!" + ZERO_WIDTH_SPACE).put(Pattern.compile(":"), ":" + ZERO_WIDTH_SPACE) 071 .put(Pattern.compile(","), "," + ZERO_WIDTH_SPACE).put(Pattern.compile(";"), ";" + ZERO_WIDTH_SPACE) 072 .toImmutableMap(); 073 074 private final String source; 075 076 @Getter(lazy = true) 077 private final String abridgedText = initAbridged(); 078 079 @Getter 080 private boolean abridged = false; 081 082 @Getter(lazy = true) 083 private final String textWithEnforcedLineBreaks = initTextWithLineBreaks(); 084 085 @Setter 086 private Integer forceLengthBreak = null; 087 088 @Setter 089 private Integer abridgedLength = null; 090 091 /** 092 * Construct TextSplitter. 093 * 094 * @param longString source text which will be processed 095 */ 096 public TextSplitter(final String longString) { 097 source = nullToEmpty(longString); 098 } 099 100 /** 101 * Alternative Constructor 102 * 103 * @param source target text 104 * @param forceLengthBreakCount count of characters when a text break will 105 * forced 106 * @param abridgedLengthCount count of characters 107 */ 108 public TextSplitter(final String source, final int forceLengthBreakCount, final int abridgedLengthCount) { 109 110 this.source = source; 111 forceLengthBreak = valueOf(forceLengthBreakCount); 112 abridgedLength = valueOf(abridgedLengthCount); 113 } 114 115 private int getForceLengthBreak() { 116 if (null == forceLengthBreak) { 117 return DEFAULT_FORCE_LENGTH_BREAK; 118 } 119 return forceLengthBreak; 120 } 121 122 private int getAbridgedLength() { 123 if (null == abridgedLength) { 124 return DEFAULT_ABRIDGED_LENGTH; 125 } 126 return abridgedLength; 127 } 128 129 private String initAbridged() { 130 var result = ""; 131 132 if (!isEmpty(source)) { 133 134 final var sourceSplitted = getSourceSplitted(); 135 136 if (sourceSplitted.size() == 1) { 137 result = abridgeComputerProducedText(); 138 } else { 139 result = abridgeHumanProducedText(sourceSplitted); 140 } 141 } 142 143 abridged = endsWith(result, TRADE_STR); 144 145 return result.trim(); 146 } 147 148 private static boolean endsWith(final String str, final String suffix) { 149 return str.trim().endsWith(suffix); 150 } 151 152 /** 153 * @return abridged text 154 */ 155 private String abridgeComputerProducedText() { 156 final var maxLength = getAbridgedLength() - (TRADE_STR.length() + 1); 157 if (source.length() > maxLength) { 158 return source.substring(0, maxLength) + " ..."; 159 } 160 return source; 161 } 162 163 /** 164 * @param sourceSplitted 165 * @return abridged text 166 */ 167 private String abridgeHumanProducedText(final List<String> sourceSplitted) { 168 final var maxLength = getAbridgedLength() - TRADE_STR.length(); 169 final var builder = new StringBuilder(); 170 var count = 0; 171 for (final String part : sourceSplitted) { 172 count = count + part.length(); 173 if (count >= maxLength) { 174 builder.append(TRADE_STR); 175 break; 176 } 177 178 builder.append(part).append(" "); 179 180 count = count + 1; 181 } 182 return builder.toString(); 183 } 184 185 private String initTextWithLineBreaks() { 186 187 var result = ""; 188 189 if (!isEmpty(source)) { 190 final var sourceSplitted = getSourceSplitted(); 191 if (sourceSplitted.size() == 1) { 192 result = forceLineBreakForComputerProducedText(source); 193 } else { 194 result = forceLineBreakForHumanProducedText(sourceSplitted); 195 } 196 } 197 198 return result.trim(); 199 } 200 201 private String forceLineBreakForHumanProducedText(final List<String> sourceSplitted) { 202 final var builder = new StringBuilder(); 203 for (final String text : sourceSplitted) { 204 builder.append(forceLineBreakForComputerProducedText(text)).append(" "); 205 } 206 return builder.toString(); 207 } 208 209 /** 210 * Try to separate text target on native text breaks. If this is not enough use 211 * brute-force on max allowed length. 212 * 213 * @param text target which will be analyzed 214 * @return 215 */ 216 private String forceLineBreakForComputerProducedText(final String text) { 217 218 // try to separate on native text breaks 219 var clean = text; 220 for (final Entry<Pattern, String> entry : REPLACEMENT_MAP.entrySet()) { 221 final var matcher = entry.getKey().matcher(clean); 222 clean = matcher.replaceAll(entry.getValue()); 223 } 224 225 final var splittedByZeroWidthSpace = getSplittedByZeroWidthSpace(clean); 226 final List<String> lengthTrimed = new ArrayList<>(); 227 228 for (final String item : splittedByZeroWidthSpace) { 229 lengthTrimed.add(bruteForceSplit(item)); 230 } 231 232 return Joiner.on(ZERO_WIDTH_SPACE).join(lengthTrimed); 233 } 234 235 /** 236 * Verify if very long text still exists and execute brute-force dissipation 237 * 238 * @param text target 239 * @return fragmented text if length doesn't fit to force length break 240 */ 241 private String bruteForceSplit(final String text) { 242 final var maxLength = getForceLengthBreak(); 243 if (!isEmpty(text)) { 244 final var builder = new StringBuilder(); 245 var tmp = text; 246 while (tmp.length() > maxLength) { 247 builder.append(tmp, 0, maxLength).append(ZERO_WIDTH_SPACE); 248 tmp = tmp.substring(maxLength); 249 } 250 if (!tmp.isEmpty()) { 251 builder.append(tmp); 252 } 253 return builder.toString(); 254 } 255 return text; 256 } 257 258 private static List<String> getSplittedByZeroWidthSpace(final String value) { 259 return Splitter.on(ZERO_WIDTH_SPACE).splitToList(value); 260 } 261 262 private List<String> getSourceSplitted() { 263 return Splitter.on(" ").splitToList(source); 264 } 265}