001/*
002 * Copyright © 2025 CUI-OpenSource-Software (info@cuioss.de)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package de.cuioss.http.security.validation;
017
018import de.cuioss.http.security.config.SecurityConfiguration;
019import de.cuioss.http.security.core.HttpSecurityValidator;
020import de.cuioss.http.security.core.UrlSecurityFailureType;
021import de.cuioss.http.security.core.ValidationType;
022import de.cuioss.http.security.exceptions.UrlSecurityException;
023import org.jspecify.annotations.Nullable;
024
025import java.util.ArrayList;
026import java.util.List;
027import java.util.Optional;
028import java.util.function.Predicate;
029import java.util.regex.Pattern;
030
031/**
032 * Path normalization validation stage with security checks.
033 *
034 * <p>This stage performs RFC 3986 Section 5.2.4 path normalization to resolve
035 * relative path segments (. and ..) while detecting and preventing path traversal
036 * attacks. The stage processes paths through multiple security layers:</p>
037 *
038 * <ol>
039 *   <li><strong>Segment Parsing</strong> - Splits path into segments for processing</li>
040 *   <li><strong>Normalization</strong> - Resolves . and .. segments according to RFC 3986</li>
041 *   <li><strong>Security Validation</strong> - Detects remaining traversal attempts</li>
042 *   <li><strong>Root Escape Detection</strong> - Prevents escaping application root</li>
043 * </ol>
044 *
045 * <h3>Design Principles</h3>
046 * <ul>
047 *   <li><strong>RFC Compliance</strong> - Follows RFC 3986 path normalization rules</li>
048 *   <li><strong>Security First</strong> - Detects attacks through normalization analysis</li>
049 *   <li><strong>DoS Protection</strong> - Prevents excessive nesting and recursion attacks</li>
050 *   <li><strong>Thread Safety</strong> - Safe for concurrent use across multiple threads</li>
051 * </ul>
052 *
053 * <h3>Security Validations</h3>
054 * <ul>
055 *   <li><strong>Path Traversal</strong> - Detects ../ patterns that remain after normalization</li>
056 *   <li><strong>Root Escape</strong> - Prevents paths from escaping the application root</li>
057 *   <li><strong>Excessive Nesting</strong> - Limits path depth to prevent resource exhaustion</li>
058 *   <li><strong>Malicious Patterns</strong> - Identifies suspicious path construction</li>
059 * </ul>
060 *
061 * <h3>Usage Examples</h3>
062 * <pre>
063 * // Create normalization stage
064 * SecurityConfiguration config = SecurityConfiguration.defaults();
065 * NormalizationStage normalizer = new NormalizationStage(config, ValidationType.URL_PATH);
066 *
 * // Normalize legitimate path (validate returns an Optional)
 * Optional&lt;String&gt; normalized = normalizer.validate("/api/users/./123/../456");
 * // Returns: Optional.of("/api/users/456")
070 *
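 * // Detect path traversal attack (relative path climbing above its starting point)
 * try {
 *     normalizer.validate("../../etc/passwd");
 *     // Throws UrlSecurityException with DIRECTORY_ESCAPE_ATTEMPT
 * } catch (UrlSecurityException e) {
 *     logger.warn("Path traversal blocked: {}", e.getFailureType());
 * }
 * // Note: in an absolute path, ".." segments above the root are dropped as in
 * // RFC 3986 Section 5.2.4, so "/api/../../etc/passwd" normalizes to "/etc/passwd"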
078 *
 * // Detect excessive nesting attack (more than 100 directory levels)
 * try {
 *     normalizer.validate("/a".repeat(101));
 *     // Throws UrlSecurityException with EXCESSIVE_NESTING
 * } catch (UrlSecurityException e) {
 *     logger.warn("DoS attack blocked: {}", e.getFailureType());
 * }
 * // Note: alternating "x/.." pairs such as "/a/../b/../c" never grow the depth
 * // and simply normalize to "/c"
086 * </pre>
087 *
088 * <h3>Performance Characteristics</h3>
089 * <ul>
090 *   <li>O(n) time complexity where n is the number of path segments</li>
091 *   <li>Single pass through path segments with early termination</li>
 *   <li>Per call it allocates the split segment array, one output list and one StringBuilder</li>
093 *   <li>DoS protection through segment counting</li>
094 * </ul>
095 *
096 * <h3>RFC 3986 Compliance</h3>
097 * <p>This implementation follows RFC 3986 Section 5.2.4 "Remove Dot Segments":</p>
098 * <ul>
099 *   <li>Single dot segments (.) are removed</li>
100 *   <li>Double dot segments (..) remove the previous segment</li>
101 *   <li>Trailing slashes are preserved</li>
102 *   <li>Leading slashes are preserved</li>
103 * </ul>
104 * <p>
105 * Implements: Task V2 from HTTP verification specification
106 *
107 * @param config         Security configuration controlling validation behavior.
108 * @param validationType Type of validation being performed (URL_PATH, PARAMETER_NAME, etc.).
109 * @see HttpSecurityValidator
110 * @see SecurityConfiguration
111 * @see ValidationType
112 * @since 1.0
113 */
114public record NormalizationStage(SecurityConfiguration config,
115ValidationType validationType) implements HttpSecurityValidator {
116
117    /**
118     * Maximum number of path segments to prevent DoS attacks.
119     * This limit prevents excessive processing time from deeply nested paths.
120     */
121    private static final int MAX_PATH_SEGMENTS = 1000;
122
123    /**
124     * Maximum directory depth to prevent excessive nesting attacks.
125     * Based on common filesystem and application server limits.
126     */
127    private static final int MAX_DIRECTORY_DEPTH = 100;
128
129    /**
130     * Precompiled pattern to detect URLs with protocol schemes.
131     * Matches RFC 3986 scheme format: scheme://authority/path
132     * Used to prevent normalization of protocol portions in URLs.
133     */
134    private static final Pattern URL_WITH_PROTOCOL_PATTERN = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.-]*://.*");
135
136    /**
137     * Pattern to detect suspicious single-component directory traversal.
138     * Matches patterns like "valid/../segment" where a single path segment (not starting with ..)
139     * precedes "../" and is followed by another path segment.
140     * Updated to handle RFC 3986 allowed characters including dots, tildes, and sub-delimiters.
141     */
142    static final Pattern SINGLE_COMPONENT_TRAVERSAL_PATTERN = Pattern.compile("^(?!\\.\\./)[^/\\\\]+/\\.\\./[^/\\\\]+$");
143
144    /**
145     * Pattern to detect multiple consecutive dots with path separators.
146     * Matches patterns like ".../" or "...\\" which could be traversal bypass attempts.
147     * Does not match legitimate filenames like "file...txt".
148     * Uses .find() with simple pattern to prevent ReDoS attacks.
149     */
150    static final Pattern MULTIPLE_DOTS_WITH_SEPARATOR_PATTERN = Pattern.compile("\\.{3,}[/\\\\]");
151
152    /**
153     * Pattern for splitting paths on forward slash or backslash separators.
154     * Used for parsing path segments during traversal detection.
155     */
156    static final Pattern PATH_SEPARATOR_PATTERN = Pattern.compile("[/\\\\]");
157
158    /**
159     * Pattern to detect paths ending with "/..".
160     * Matches paths that end with forward slash followed by double dot.
161     */
162    static final Pattern ENDS_WITH_SLASH_DOTDOT_PATTERN = Pattern.compile(".*/\\.\\.$");
163
164    /**
165     * Pattern to detect paths starting with "../".
166     * Matches paths that begin with double dot followed by forward slash.
167     */
168    static final Pattern STARTS_WITH_DOTDOT_SLASH_PATTERN = Pattern.compile("^\\.\\./.*");
169
170    /**
171     * Pattern to detect paths starting with "..\\".
172     * Matches paths that begin with double dot followed by backslash.
173     */
174    static final Pattern STARTS_WITH_DOTDOT_BACKSLASH_PATTERN = Pattern.compile("^\\.\\.\\\\..*");
175
176    /**
177     * Pattern to detect internal slash-dotdot patterns.
178     * Matches "/" followed by ".." only when it's a directory traversal (followed by "/" or end of string).
179     * This avoids false positives for filenames starting with ".." like "a/..c"
180     * Optimized for .find() usage without unnecessary .* wrappers.
181     */
182    static final Pattern CONTAINS_SLASH_DOTDOT_PATTERN = Pattern.compile("/\\.\\.(?:/|$)");
183
184    /**
185     * Pattern to detect internal dotdot-backslash patterns.
186     * Matches ".." followed by "\\" anywhere in the path.
187     * Used in conjunction with STARTS_WITH_DOTDOT_BACKSLASH_PATTERN to exclude initial "..\\".
188     * Optimized for .find() usage without unnecessary .* wrappers.
189     */
190    static final Pattern CONTAINS_DOTDOT_BACKSLASH_PATTERN = Pattern.compile("\\.\\.\\\\");
191
192
193    /**
194     * Validates and normalizes a path with comprehensive security checks.
195     *
196     * <p>Processing stages:</p>
197     * <ol>
198     *   <li>Input validation - handles null/empty inputs</li>
199     *   <li>Path segment parsing - splits on directory separators</li>
200     *   <li>RFC 3986 normalization - resolves . and .. segments</li>
201     *   <li>Security validation - detects remaining attack patterns</li>
202     * </ol>
203     *
204     * @param value The input path to validate and normalize
205     * @return The validated and normalized path wrapped in Optional, or Optional.empty() if input was null
206     * @throws UrlSecurityException if any security violations are detected:
207     *                              <ul>
208     *                                <li>EXCESSIVE_NESTING - if path contains too many segments or depth</li>
209     *                                <li>PATH_TRAVERSAL_DETECTED - if ../ patterns remain after normalization</li>
210     *                                <li>DIRECTORY_ESCAPE_ATTEMPT - if normalized path tries to escape root</li>
211     *                              </ul>
212     */
213    @Override
214    public Optional<String> validate(@Nullable String value) throws UrlSecurityException {
215        if (value == null) {
216            return Optional.empty();
217        }
218        if (value.isEmpty()) {
219            return Optional.of(value);
220        }
221
222        // Save original for comparison and error reporting
223        @SuppressWarnings("UnnecessaryLocalVariable") // Used in exception handling below
224        String original = value;
225
226        // LAYER 1: Semantic Intent Validation - Detect directory traversal patterns BEFORE normalization
227        // This follows OWASP/CISA best practices for defense in depth
228        if (containsDirectoryTraversalIntent(original)) {
229            throw UrlSecurityException.builder()
230                    .failureType(UrlSecurityFailureType.PATH_TRAVERSAL_DETECTED)
231                    .validationType(validationType)
232                    .originalInput(original)
233                    .detail("Directory traversal pattern detected in input")
234                    .build();
235        }
236
237        // Normalize URI components (resolve . and .. in path segments)
238        String normalized = normalizeUriComponent(value);
239
240        // Check if path escapes root after normalization (check first for proper precedence)
241        if (escapesRoot(normalized)) {
242            throw UrlSecurityException.builder()
243                    .failureType(UrlSecurityFailureType.DIRECTORY_ESCAPE_ATTEMPT)
244                    .validationType(validationType)
245                    .originalInput(original)
246                    .sanitizedInput(normalized)
247                    .detail("Path attempts to escape root directory")
248                    .build();
249        }
250
251        // LAYER 2: Syntactic Validation - Check for remaining traversal patterns after normalization
252        if (containsInternalPathTraversal(normalized)) {
253            throw UrlSecurityException.builder()
254                    .failureType(UrlSecurityFailureType.PATH_TRAVERSAL_DETECTED)
255                    .validationType(validationType)
256                    .originalInput(original)
257                    .sanitizedInput(normalized)
258                    .detail("Path normalization revealed traversal attempt")
259                    .build();
260        }
261
262        return Optional.of(normalized);
263    }
264
265    /**
266     * Normalizes URI components according to RFC 3986 with DoS protection.
267     *
268     * <p>This method implements RFC 3986 Section 5.2.4 "Remove Dot Segments" algorithm
269     * for path components, while preserving complete URIs with protocol schemes.
270     * Includes additional security measures to prevent resource exhaustion attacks.</p>
271     *
272     * @param uriComponent The URI component to normalize (path segment or complete URI)
273     * @return The normalized URI component
274     * @throws UrlSecurityException if processing limits are exceeded
275     */
276    private String normalizeUriComponent(String uriComponent) {
277        // Check if this is a complete URI with protocol - don't normalize protocol portion
278        if (URL_WITH_PROTOCOL_PATTERN.matcher(uriComponent).matches()) {
279            return uriComponent;
280        }
281
282        // RFC 3986 path segment normalization with recursion protection
283        String[] segments = uriComponent.split("/", -1);
284        List<String> outputSegments = new ArrayList<>();
285        boolean isAbsolute = uriComponent.startsWith("/");
286
287        // Validate segment count
288        validateSegmentCount(segments.length, uriComponent);
289
290        // Process each segment
291        for (String segment : segments) {
292            processPathSegment(segment, outputSegments, isAbsolute, uriComponent);
293        }
294
295        // Build and return normalized path
296        return buildNormalizedPath(outputSegments, isAbsolute, uriComponent);
297    }
298
299    /**
300     * Validates that the segment count does not exceed security limits.
301     *
302     * @param segmentCount Number of path segments
303     * @param originalInput Original input for error reporting
304     * @throws UrlSecurityException if segment count exceeds limits
305     */
306    private void validateSegmentCount(int segmentCount, String originalInput) {
307        if (segmentCount > MAX_PATH_SEGMENTS) {
308            throw UrlSecurityException.builder()
309                    .failureType(UrlSecurityFailureType.EXCESSIVE_NESTING)
310                    .validationType(validationType)
311                    .originalInput(originalInput)
312                    .detail("Path contains too many segments: " + segmentCount + " (max: " + MAX_PATH_SEGMENTS + ")")
313                    .build();
314        }
315    }
316
317    /**
318     * Processes a single path segment according to RFC 3986 normalization rules.
319     *
320     * @param segment Path segment to process
321     * @param outputSegments Current output segments list
322     * @param isAbsolute Whether this is an absolute path
323     * @param originalInput Original input for error reporting
324     * @throws UrlSecurityException if depth limits are exceeded
325     */
326    private void processPathSegment(String segment, List<String> outputSegments, boolean isAbsolute, String originalInput) {
327        switch (segment) {
328            case "." -> {
329                // Current directory - skip (RFC 3986 Section 5.2.4)
330            }
331            case ".." -> {
332                // Parent directory
333                if (!outputSegments.isEmpty() && !"..".equals(outputSegments.getLast())) {
334                    // Can resolve this .. by removing the previous segment
335                    outputSegments.removeLast();
336                } else if (!isAbsolute) {
337                    // For relative paths, keep .. if we can't resolve it
338                    outputSegments.add("..");
339                }
340                // For absolute paths, .. at root is ignored
341            }
342            case "" -> {
343                // Empty segment - only preserve for leading slash or trailing slash
344                // Skip empty segments from double slashes in the middle
345            }
346            default -> {
347                // Normal segment
348                outputSegments.add(segment);
349                validateDirectoryDepth(outputSegments.size(), originalInput);
350            }
351        }
352    }
353
354    /**
355     * Validates that directory depth does not exceed security limits.
356     *
357     * @param currentDepth Current directory depth
358     * @param originalInput Original input for error reporting
359     * @throws UrlSecurityException if depth exceeds limits
360     */
361    private void validateDirectoryDepth(int currentDepth, String originalInput) {
362        if (currentDepth > MAX_DIRECTORY_DEPTH) {
363            throw UrlSecurityException.builder()
364                    .failureType(UrlSecurityFailureType.EXCESSIVE_NESTING)
365                    .validationType(validationType)
366                    .originalInput(originalInput)
367                    .detail("Path depth " + currentDepth + " exceeds maximum " + MAX_DIRECTORY_DEPTH)
368                    .build();
369        }
370    }
371
372    /**
373     * Builds the normalized path string from processed segments.
374     *
375     * @param outputSegments Processed path segments
376     * @param isAbsolute Whether this is an absolute path
377     * @param originalInput Original input for trailing slash preservation
378     * @return Normalized path string
379     */
380    private String buildNormalizedPath(List<String> outputSegments, boolean isAbsolute, String originalInput) {
381        StringBuilder result = new StringBuilder();
382
383        // Add leading slash for absolute paths
384        if (isAbsolute) {
385            result.append("/");
386        }
387
388        // Add segments
389        for (int i = 0; i < outputSegments.size(); i++) {
390            if (i > 0) {
391                result.append("/");
392            }
393            result.append(outputSegments.get(i));
394        }
395
396        // Preserve trailing slash if present and we have content, or for root path
397        if (originalInput.endsWith("/") && !result.toString().endsWith("/") && (!outputSegments.isEmpty() || isAbsolute)) {
398            result.append("/");
399        }
400
401        return result.toString();
402    }
403
404    /**
405     * Detects directory traversal intent patterns in the original input before normalization.
406     *
407     * <p>This method implements semantic validation following OWASP/CISA best practices
408     * for defense in depth. It identifies patterns that indicate malicious directory
409     * navigation intent, such as "valid/../segment", regardless of normalization outcome.</p>
410     *
411     * <p>Based on research analysis of CVEs:
412     * <a href="https://nvd.nist.gov/vuln/detail/CVE-2021-41773">CVE-2021-41773</a>,
413     * <a href="https://nvd.nist.gov/vuln/detail/CVE-2021-42013">CVE-2021-42013</a>,
414     * <a href="https://nvd.nist.gov/vuln/detail/CVE-2024-38819">CVE-2024-38819</a>
415     * and industry best practices, patterns like "directory/../target" represent attack
416     * fingerprints that should be rejected semantically before syntactic processing.</p>
417     *
418     * @param input The original input path to analyze for traversal intent
419     * @return true if the input contains directory traversal patterns indicating malicious intent
420     */
421    private boolean containsDirectoryTraversalIntent(String input) {
422        // Based on research, focus on specific attack patterns while allowing legitimate RFC 3986 navigation
423
424        // Pattern 1: Suspicious single-component traversal patterns
425        // This targets cases like "valid/../segment" where a single word precedes "../"
426        // but allows legitimate multi-level paths like "/api/users/../admin"
427        if (SINGLE_COMPONENT_TRAVERSAL_PATTERN.matcher(input).matches()) {
428            return true;
429        }
430
431        // Pattern 2: Encoded traversal attempts (based on Apache CVE research)
432        // Covers URL encoded variants like "..%2e/" or "%2e%2e/"
433        if (input.contains("..%") || input.contains("%2e%2e") || input.contains("%2E%2E")) {
434            return true;
435        }
436
437        // Pattern 3: Multiple consecutive dots with separators (traversal bypass attempts)
438        // Covers ".../" but NOT "file...txt"
439        if (MULTIPLE_DOTS_WITH_SEPARATOR_PATTERN.matcher(input).find()) {
440            return true;
441        }
442
443        // Pattern 4: Windows-style backslash traversal (but not if it starts with ..)
444        // Patterns starting with .. should be handled by escapesRoot check
445        return CONTAINS_DOTDOT_BACKSLASH_PATTERN.matcher(input).find() &&
446                !STARTS_WITH_DOTDOT_BACKSLASH_PATTERN.matcher(input).matches();
447    }
448
449    /**
450     * Checks if the normalized path contains internal path traversal patterns.
451     *
452     * <p>After proper normalization, there should be no remaining .. segments
453     * except at the beginning for relative paths (which is handled by escapesRoot).
454     * This method performs comprehensive checks for any remaining traversal patterns
455     * that could indicate incomplete normalization or sophisticated attacks.</p>
456     *
457     * @param path The normalized path to check
458     * @return true if path contains internal traversal patterns
459     */
460    private boolean containsInternalPathTraversal(String path) {
461        // After normalization, check for .. segments that aren't at the start
462        if (CONTAINS_SLASH_DOTDOT_PATTERN.matcher(path).find()) {
463            return true;
464        }
465
466        // For backslash patterns, exclude those starting with ..\\ (handled by escapesRoot)
467        if (CONTAINS_DOTDOT_BACKSLASH_PATTERN.matcher(path).find() &&
468                !STARTS_WITH_DOTDOT_BACKSLASH_PATTERN.matcher(path).matches()) {
469            return true;
470        }
471
472        // Check for .. at end of path (without leading ../)
473        if (ENDS_WITH_SLASH_DOTDOT_PATTERN.matcher(path).matches() &&
474                !STARTS_WITH_DOTDOT_SLASH_PATTERN.matcher(path).matches()) {
475            return true;
476        }
477
478        // Check for standalone .. that isn't at the beginning
479        if ("..".equals(path)) {
480            return true;
481        }
482
483        // Additional security: check for any .. that appears as a complete path segment
484        // This catches cases where .. remains as directory navigation after normalization
485        // but excludes .. that appears embedded within filenames (fixing false positives)
486        if (path.contains("..")) {
487            // Check if .. appears as a complete path segment (separated by slashes)
488            String[] segments = PATH_SEPARATOR_PATTERN.split(path);
489            for (String segment : segments) {
490                if ("..".equals(segment)) {
491                    return true;
492                }
493            }
494        }
495
496        return false;
497    }
498
499    /**
500     * Checks if the normalized path attempts to escape the application root.
501     *
502     * <p>This check identifies paths that would navigate outside the intended
503     * directory structure after normalization.</p>
504     *
505     * @param path The normalized path to check
506     * @return true if path attempts to escape root
507     */
508    private boolean escapesRoot(String path) {
509        // Check if normalized path tries to escape root
510        return STARTS_WITH_DOTDOT_SLASH_PATTERN.matcher(path).matches() ||
511                STARTS_WITH_DOTDOT_BACKSLASH_PATTERN.matcher(path).matches();
512    }
513
514    /**
515     * Creates a conditional validator that only processes inputs matching the condition.
516     *
517     * @param condition The condition to test before validation
518     * @return A conditional HttpSecurityValidator that applies normalization conditionally
519     */
520    @Override
521    public HttpSecurityValidator when(Predicate<String> condition) {
522        return input -> {
523            if (input == null || !condition.test(input)) {
524                return Optional.ofNullable(input);
525            }
526            return validate(input);
527        };
528    }
529
530
531}