001/*
002 * Copyright © 2025 CUI-OpenSource-Software (info@cuioss.de)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package de.cuioss.http.security.validation;
017
import de.cuioss.http.security.config.SecurityConfiguration;
import de.cuioss.http.security.core.HttpSecurityValidator;
import de.cuioss.http.security.core.UrlSecurityFailureType;
import de.cuioss.http.security.core.ValidationType;
import de.cuioss.http.security.exceptions.UrlSecurityException;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import org.jspecify.annotations.Nullable;

import java.util.BitSet;
import java.util.Locale;
import java.util.Optional;
029
030/**
031 * Character validation stage that enforces RFC-compliant character sets for HTTP components.
032 *
033 * <p>This stage validates input characters against component-specific allowed character sets,
034 * ensuring compliance with HTTP specifications and preventing character-based security attacks.
035 * It performs comprehensive character validation including null byte detection, control character
036 * filtering, and percent-encoding validation.</p>
037 *
038 * <h3>Design Principles</h3>
039 * <ul>
040 *   <li><strong>RFC Compliance</strong> - Enforces RFC 3986 (URI) and RFC 7230 (HTTP) character rules</li>
041 *   <li><strong>Security First</strong> - Rejects dangerous characters before any processing</li>
042 *   <li><strong>Context Aware</strong> - Different character sets for different HTTP components</li>
043 *   <li><strong>Performance Optimized</strong> - Uses BitSet for O(1) character lookups</li>
044 *   <li><strong>Configurable</strong> - Allows fine-tuning of character validation rules</li>
045 * </ul>
046 *
047 * <h3>Character Validation Rules</h3>
048 * <ul>
049 *   <li><strong>URL Paths</strong> - RFC 3986 unreserved + path-specific characters</li>
050 *   <li><strong>Parameters</strong> - RFC 3986 query characters with percent-encoding support</li>
051 *   <li><strong>Headers</strong> - RFC 7230 visible ASCII minus delimiters</li>
052 *   <li><strong>Cookies</strong> - Restricted character set for cookie safety</li>
053 *   <li><strong>Bodies</strong> - Content-type specific character validation</li>
054 * </ul>
055 *
056 * <h3>Security Features</h3>
057 * <ul>
058 *   <li><strong>Null Byte Detection</strong> - Prevents null byte injection attacks</li>
059 *   <li><strong>Control Character Filtering</strong> - Blocks dangerous control characters</li>
060 *   <li><strong>Percent Encoding Validation</strong> - Validates hex digit sequences</li>
061 *   <li><strong>High-Bit Character Control</strong> - Configurable handling of non-ASCII characters</li>
062 * </ul>
063 *
064 * <h3>Usage Examples</h3>
065 * <pre>
066 * // Create character validation stage
067 * SecurityConfiguration config = SecurityConfiguration.defaults();
068 * CharacterValidationStage validator = new CharacterValidationStage(config, ValidationType.URL_PATH);
069 *
070 * // Validate URL path characters
071 * try {
072 *     validator.validate("/api/users/123"); // Valid path characters
073 *     validator.validate("/api/../etc/passwd"); // May contain invalid traversal patterns
074 * } catch (UrlSecurityException e) {
075 *     logger.warn("Invalid characters detected: {}", e.getFailureType());
076 * }
077 *
078 * // Validate parameter with percent encoding
079 * CharacterValidationStage paramValidator = new CharacterValidationStage(config, ValidationType.PARAMETER_VALUE);
080 * try {
081 *     paramValidator.validate("hello%20world"); // Valid percent-encoded space
082 *     paramValidator.validate("hello%00world"); // Null byte - will be rejected
083 * } catch (UrlSecurityException e) {
084 *     logger.warn("Character validation failed: {}", e.getDetail());
085 * }
086 * </pre>
087 *
088 * <h3>Configuration Options</h3>
089 * <ul>
090 *   <li><strong>allowNullBytes</strong> - Whether to permit null bytes (default: false)</li>
091 *   <li><strong>allowControlCharacters</strong> - Whether to permit control characters (default: false)</li>
092 *   <li><strong>allowExtendedAscii</strong> - Whether to permit extended ASCII characters (128-255).
093 *       <ul>
094 *         <li>For URL paths and parameters: Allows characters 128-255 when enabled</li>
095 *         <li>For header names and cookies: Always rejected per RFC (setting ignored)</li>
096 *         <li>For header values and body: Enables both extended ASCII and Unicode support</li>
097 *         <li>Note: Unicode beyond 255 is always rejected for URLs per RFC 3986</li>
098 *       </ul>
099 *       (default: false)</li>
100 * </ul>
101 *
102 * <h3>Performance Characteristics</h3>
103 * <ul>
104 *   <li>O(n) time complexity where n is input length</li>
105 *   <li>O(1) character lookup using BitSet</li>
106 *   <li>Early termination on first invalid character</li>
107 *   <li>Minimal memory allocation during validation</li>
108 * </ul>
109 *
110 * @see CharacterValidationConstants
111 * @see SecurityConfiguration
112 * @see ValidationType
113 * @since 1.0
114 */
115@EqualsAndHashCode
116@ToString
117public final class CharacterValidationStage implements HttpSecurityValidator {
118
119    private final BitSet allowedChars;
120    private final ValidationType validationType;
121    private final boolean allowPercentEncoding;
122    private final boolean allowNullBytes;
123    private final boolean allowControlCharacters;
124    private final boolean allowExtendedAscii;
125
126    public CharacterValidationStage(SecurityConfiguration config, ValidationType type) {
127        this.validationType = type;
128        this.allowNullBytes = config.allowNullBytes();
129        this.allowControlCharacters = config.allowControlCharacters();
130        this.allowExtendedAscii = config.allowExtendedAscii();
131        // Use the shared BitSet directly - it's read-only after initialization
132        this.allowedChars = CharacterValidationConstants.getCharacterSet(type);
133
134        // Determine if percent encoding is allowed based on type
135        this.allowPercentEncoding = switch (type) {
136            case URL_PATH, PARAMETER_NAME, PARAMETER_VALUE -> true;
137            default -> false;  // HEADER_NAME, HEADER_VALUE and others don't allow percent encoding
138        };
139    }
140
141    @Override
142    @SuppressWarnings("squid:S3516")
143    public Optional<String> validate(@Nullable String value) throws UrlSecurityException {
144        // Quick check for null/empty
145        if (value == null) {
146            return Optional.empty();
147        }
148        if (value.isEmpty()) {
149            return Optional.of(value);
150        }
151
152        validateCharacters(value);
153        return Optional.of(value);
154    }
155
156    /**
157     * Validates all characters in the input string.
158     * @param value The string to validate
159     * @throws UrlSecurityException if any character validation fails
160     */
161    private void validateCharacters(String value) throws UrlSecurityException {
162        int i = 0;
163        while (i < value.length()) {
164            char ch = value.charAt(i);
165
166            // Check for null byte FIRST (highest priority security check)
167            if (ch == '\0') {
168                handleNullByte(value, i);
169            }
170
171            // Handle percent encoding
172            if (ch == '%' && allowPercentEncoding) {
173                validatePercentEncoding(value, i);
174                i += 3; // Skip the percent sign and two hex digits
175                continue;
176            }
177
178            // Check if character is allowed based on configuration and character sets
179            if (!isCharacterAllowed(ch)) {
180                handleInvalidCharacter(value, ch, i);
181            }
182            i++;
183        }
184    }
185
186    /**
187     * Handles null byte detection.
188     * @param value The original input string
189     * @param position The position of the null byte
190     * @throws UrlSecurityException if null bytes are not allowed
191     */
192    private void handleNullByte(String value, int position) throws UrlSecurityException {
193        if (!allowNullBytes) {
194            throw UrlSecurityException.builder()
195                    .failureType(UrlSecurityFailureType.NULL_BYTE_INJECTION)
196                    .validationType(validationType)
197                    .originalInput(value)
198                    .detail("Null byte detected at position " + position)
199                    .build();
200        }
201    }
202
203    /**
204     * Handles invalid character detection.
205     * @param value The original input string
206     * @param ch The invalid character
207     * @param position The position of the invalid character
208     * @throws UrlSecurityException for the invalid character
209     */
210    private void handleInvalidCharacter(String value, char ch, int position) throws UrlSecurityException {
211        UrlSecurityFailureType failureType = getFailureTypeForCharacter(ch);
212        throw UrlSecurityException.builder()
213                .failureType(failureType)
214                .validationType(validationType)
215                .originalInput(value)
216                .detail("Invalid character '" + ch + "' (0x" + Integer.toHexString(ch).toUpperCase() + ") at position " + position)
217                .build();
218    }
219
220    /**
221     * Validates percent encoding at the given position.
222     * @param value The string to validate
223     * @param position The position of the percent sign
224     * @throws UrlSecurityException if the percent encoding is invalid
225     */
226    private void validatePercentEncoding(String value, int position) throws UrlSecurityException {
227        // Must be followed by two hex digits
228        if (position + 2 >= value.length()) {
229            throw UrlSecurityException.builder()
230                    .failureType(UrlSecurityFailureType.INVALID_ENCODING)
231                    .validationType(validationType)
232                    .originalInput(value)
233                    .detail("Incomplete percent encoding at position " + position)
234                    .build();
235        }
236
237        char hex1 = value.charAt(position + 1);
238        char hex2 = value.charAt(position + 2);
239        if (isNotHexDigit(hex1) || isNotHexDigit(hex2)) {
240            throw UrlSecurityException.builder()
241                    .failureType(UrlSecurityFailureType.INVALID_ENCODING)
242                    .validationType(validationType)
243                    .originalInput(value)
244                    .detail("Invalid hex digits in percent encoding at position " + position)
245                    .build();
246        }
247
248        // Check for encoded null byte %00
249        if (hex1 == '0' && hex2 == '0' && !allowNullBytes) {
250            throw UrlSecurityException.builder()
251                    .failureType(UrlSecurityFailureType.NULL_BYTE_INJECTION)
252                    .validationType(validationType)
253                    .originalInput(value)
254                    .detail("Encoded null byte (%00) detected at position " + position)
255                    .build();
256        }
257    }
258
259    private boolean isNotHexDigit(char ch) {
260        return !((ch >= '0' && ch <= '9') ||
261                (ch >= 'A' && ch <= 'F') ||
262                (ch >= 'a' && ch <= 'f'));
263    }
264
265    /**
266     * Checks if a character is allowed based on configuration flags and character sets.
267     */
268    private boolean isCharacterAllowed(char ch) {
269        // Null byte (0) - should be allowed if configured (already checked earlier but may reach here)
270        if (ch == 0) {
271            return allowNullBytes;
272        }
273
274        // Control characters (1-31, excluding null which is handled above)
275        if (ch <= 31) {
276            // Always allow common whitespace characters that are in the base character set
277            if (allowedChars.get(ch)) {
278                return true;
279            }
280            // Other control characters depend on configuration
281            return allowControlCharacters;
282        }
283
284        // Characters 32-127 (basic ASCII) - check against the base character set
285        if (ch <= 127) {
286            return allowedChars.get(ch);
287        }
288
289        // Extended ASCII characters (128-255)
290        // Different validation types have different rules for extended ASCII
291        if (ch <= 255) {
292            // Header names and cookie names/values must be ASCII-only per RFC
293            if (validationType == ValidationType.HEADER_NAME ||
294                    validationType == ValidationType.COOKIE_NAME ||
295                    validationType == ValidationType.COOKIE_VALUE) {
296                return false;  // RFC compliance: these must be ASCII-only
297            }
298            // For other types (URL paths, parameters, header values, body),
299            // allow extended ASCII based on configuration
300            return allowExtendedAscii || allowedChars.get(ch);
301        }
302
303        // Unicode characters above 255:
304        // For URLs (paths/parameters): Always rejected per RFC 3986 (ASCII-only)
305        // For headers/body: Allowed if allowExtendedAscii is true (which enables full Unicode support for these contexts)
306        // Always reject combining characters (U+0300-U+036F) as they can cause normalization issues
307        if (ch >= 0x0300 && ch <= 0x036F) {
308            return false;
309        }
310        // The allowExtendedAscii flag controls both extended ASCII and Unicode for applicable validation types
311        return allowExtendedAscii && supportsUnicodeCharacters();
312    }
313
314    /**
315     * Determines the appropriate failure type for a rejected character.
316     */
317    private UrlSecurityFailureType getFailureTypeForCharacter(char ch) {
318        // Null byte (0)
319        if (ch == 0) {
320            return UrlSecurityFailureType.NULL_BYTE_INJECTION;
321        }
322
323        // Control characters (1-31)
324        if (ch <= 31) {
325            // For headers, control characters are just invalid characters per RFC
326            // For other contexts, they're specifically flagged as control characters for security
327            if (validationType == ValidationType.HEADER_NAME || validationType == ValidationType.HEADER_VALUE) {
328                return UrlSecurityFailureType.INVALID_CHARACTER;
329            }
330            // If it's in the base character set, it's just an invalid character for this context
331            if (allowedChars.get(ch)) {
332                return UrlSecurityFailureType.INVALID_CHARACTER;
333            }
334            return UrlSecurityFailureType.CONTROL_CHARACTERS;
335        }
336
337        // All other invalid characters (including high-bit and Unicode)
338        return UrlSecurityFailureType.INVALID_CHARACTER;
339    }
340
341    /**
342     * Determines if the current validation type supports Unicode characters beyond 255.
343     * URL paths and parameter validation are restricted to ASCII per RFC 3986,
344     * while body and header content can contain Unicode when allowExtendedAscii is enabled.
345     *
346     * Note: This method works in conjunction with allowExtendedAscii flag:
347     * - For URLs/parameters: Always returns false (ASCII-only per RFC)
348     * - For headers/body: Returns true, allowing Unicode when allowExtendedAscii is enabled
349     */
350    private boolean supportsUnicodeCharacters() {
351        return switch (validationType) {
352            case BODY -> true;  // Body content can contain Unicode
353            case HEADER_VALUE -> true;  // Header values can contain Unicode in some cases
354            case URL_PATH, PARAMETER_NAME, PARAMETER_VALUE -> false;  // RFC 3986 is ASCII-based
355            case HEADER_NAME -> false;  // Header names should be ASCII
356            case COOKIE_NAME, COOKIE_VALUE -> false;  // Cookies should be ASCII-safe
357        };
358    }
359}