iri_string/normalize/
path.rs

1//! Path normalization.
2
3use core::fmt;
4use core::ops::{ControlFlow, Range};
5
6use crate::parser::str::{find_split_hole, rfind};
7use crate::spec::{Spec, UriSpec};
8
9use super::pct_case::PctCaseNormalized;
10use super::{Error, NormalizationMode, NormalizationOp};
11
12/// Path that is (possibly) not yet processed or being processed.
13#[derive(Debug, Clone, Copy)]
14pub(crate) enum Path<'a> {
15    /// The result. No more processing is needed.
16    Done(&'a str),
17    /// Not yet completely processed path.
18    NeedsProcessing(PathToNormalize<'a>),
19}
20
21/// Path that needs merge and/or dot segment removal.
22///
23/// This works like a queue with `pop_front` functionality.
24///
25/// # Invariants
26///
27/// * `back` should be `None` if `front` is empty.
28/// * `back` should be a non-empty string if `Some`.
29/// * `front` should ends with a slash if `back` is `Some`.
30///     + In other words, any path segments can exist in `front` or `back`
31///       but not both at a time.
32#[derive(Debug, Clone, Copy)]
33pub(crate) struct PathToNormalize<'a> {
34    /// Front half of the buffer.
35    front: &'a str,
36    /// Back half of the buffer.
37    back: Option<&'a str>,
38}
39
40impl<'a> PathToNormalize<'a> {
41    /// Creates a `PathToNormalize` from the given single path.
42    #[inline]
43    #[must_use]
44    pub(crate) fn from_single_path(path: &'a str) -> Self {
45        Self {
46            front: path,
47            back: None,
48        }
49    }
50
51    /// Creates a `PathToNormalize` from the given base and reference paths to be resolved.
52    #[must_use]
53    pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self {
54        if reference.starts_with('/') {
55            return Self {
56                front: reference,
57                back: None,
58            };
59        }
60
61        match rfind(base.as_bytes(), b'/') {
62            Some(last_slash_pos) => {
63                let front = &base[..=last_slash_pos];
64                debug_assert!(front.ends_with('/'));
65                Self {
66                    front,
67                    back: Some(reference).filter(|s| !s.is_empty()),
68                }
69            }
70            None => Self {
71                front: reference,
72                back: None,
73            },
74        }
75    }
76
77    /// Returns true if the path is empty string.
78    #[inline]
79    #[must_use]
80    fn is_empty(&self) -> bool {
81        debug_assert!(
82            !(self.front.is_empty() && self.back.is_some()),
83            "the front buffer must have been refilled"
84        );
85        self.front.is_empty()
86    }
87
88    /// Returns the length of the path.
89    #[inline]
90    #[must_use]
91    pub(super) fn len(&self) -> usize {
92        self.front.len() + self.back.map_or(0, |s| s.len())
93    }
94
95    /// Returns `true` if the path starts with a slash.
96    #[must_use]
97    fn starts_with_slash(&self) -> bool {
98        self.front.starts_with('/')
99    }
100
101    /// Returns a byte at the position.
102    ///
103    /// Returns `None` if the index is out of range.
104    // This characteristic is necessary since `i` can be the index after the
105    // last byte (i.e., the "end" index of the range).
106    #[must_use]
107    fn byte_at(&self, i: usize) -> Option<u8> {
108        match i.checked_sub(self.front.len()) {
109            None => Some(self.front.as_bytes()[i]),
110            Some(back_i) => self
111                .back
112                .and_then(|back| back.as_bytes().get(back_i).copied()),
113        }
114    }
115
116    /// Returns the position of the next slash after (including) `start`.
117    #[inline]
118    #[must_use]
119    fn find_next_slash_from(&self, start: usize) -> Option<usize> {
120        if let Some(back_i) = start.checked_sub(self.front.len()) {
121            // Search the back buffer.
122            return self.back?[back_i..].find('/').map(|pos| start + pos);
123        }
124        match self.front[start..].find('/') {
125            Some(pos) => Some(start + pos),
126            None => {
127                debug_assert!(
128                    self.back.is_none(),
129                    "front must ends with a slash if back is `Some`"
130                );
131                None
132            }
133        }
134    }
135
136    /// Removes the `len` characters from the beginning of `self`.
137    fn remove_start(&mut self, len: usize) {
138        if let Some(back_len) = len.checked_sub(self.front.len()) {
139            self.front = self
140                .back
141                .take()
142                .and_then(|s| s.get(back_len..))
143                .unwrap_or_default();
144            return;
145        }
146        self.front = &self.front[len..];
147    }
148
149    /// Removes the prefix that are ignorable on normalization.
150    //
151    // Skips the prefix dot segments without leading slashes (such as `./`,
152    // `../`, and `../.././`). This is necessary because such segments should be
153    // removed along with the FOLLOWING slashes, not leading slashes.
154    fn remove_ignorable_prefix(&mut self) {
155        while let Some(seg) = PathSegmentsIter::new(self).next() {
156            if seg.has_leading_slash {
157                // The first segment starting with a slash is not target.
158                break;
159            }
160            match seg.kind(self) {
161                SegmentKind::Dot | SegmentKind::DotDot => {
162                    // Attempt to skip the following slash by `+ 1`.
163                    let skip = self.front.len().min(seg.name_range.end + 1);
164                    self.remove_start(skip);
165                }
166                SegmentKind::Normal => break,
167            }
168        }
169    }
170}
171
172impl PathToNormalize<'_> {
173    /// Writes the normalized path.
174    pub(crate) fn fmt_write_normalize<S: Spec, W: fmt::Write>(
175        mut self,
176        f: &mut W,
177        op: NormalizationOp,
178        authority_is_present: bool,
179    ) -> fmt::Result {
180        if self.is_empty() {
181            return Ok(());
182        }
183
184        if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath)
185            && !authority_is_present
186            && !self.starts_with_slash()
187        {
188            // Treat the path as "opaque", i.e. do not apply dot segments removal.
189            // See <https://github.com/lo48576/iri-string/issues/29>.
190            debug_assert!(
191                op.mode.case_pct_normalization(),
192                "case/pct normalization should still be applied"
193            );
194            write!(f, "{}", PctCaseNormalized::<S>::new(self.front))?;
195            if let Some(back) = self.back {
196                write!(f, "{}", PctCaseNormalized::<S>::new(back))?;
197            }
198            return Ok(());
199        }
200
201        // Skip the prefix dot segments without leading slashes (such as `./`,
202        // `../`, and `../.././`). This is necessary because such segments should
203        // be removed along with the FOLLOWING slashes, not leading slashes.
204        self.remove_ignorable_prefix();
205        if self.is_empty() {
206            // The path consists of only `/.`s and `/..`s.
207            // In this case, if the authority component is present, the result
208            // should be `/`, not empty.
209            if authority_is_present {
210                f.write_char('/')?;
211            }
212            return Ok(());
213        }
214
215        // None: No segments are written yet.
216        // Some(false): Something other than `/` is already written as the path.
217        // Some(true): Only a `/` is written as the path.
218        let mut only_a_slash_is_written = None;
219        // `true` if the path may have not yet handled dot segments.
220        let mut may_have_not_yet_resolved_dot_segments = true;
221        // Scan for the dot segments and resolve them.
222        while !self.is_empty() && may_have_not_yet_resolved_dot_segments {
223            /// The size of the queue to track the path segments.
224            ///
225            /// This should be nonzero.
226            const QUEUE_SIZE: usize = 8;
227
228            let ret = self.resolve_and_write_path_prefixes::<S, _, QUEUE_SIZE>(
229                &mut only_a_slash_is_written,
230                &mut may_have_not_yet_resolved_dot_segments,
231                authority_is_present,
232                f,
233                op,
234            )?;
235            if matches!(ret, ControlFlow::Break(_)) {
236                return Ok(());
237            }
238        }
239
240        if !self.is_empty() {
241            if !authority_is_present {
242                // Note that `self` has no dot segments anymore. In order to handle
243                // authority-less relative path correctly, it would be enough to
244                // check if the path to be written starts with `//` or not.
245                match only_a_slash_is_written {
246                    None => {
247                        // TODO: `Option::is_some_and()` is stabilized since Rust 1.70.0.
248                        if ((self.front == "/")
249                            && self.back.map_or(false, |back| back.starts_with('/')))
250                            || self.front.starts_with("//")
251                        {
252                            f.write_str("/.//")?;
253                            self.remove_start("//".len());
254                        }
255                    }
256                    Some(true) => {
257                        if self.starts_with_slash() {
258                            f.write_str(".//")?;
259                            self.remove_start("/".len());
260                        }
261                    }
262                    Some(false) => {}
263                }
264            }
265
266            // Emit the path at once. No need to split into segments since it
267            // has no dot segments.
268            if op.mode.case_pct_normalization() {
269                write!(f, "{}", PctCaseNormalized::<S>::new(self.front))?;
270                if let Some(back) = &self.back {
271                    write!(f, "{}", PctCaseNormalized::<S>::new(back))?;
272                }
273            } else {
274                f.write_str(self.front)?;
275                if let Some(back) = &self.back {
276                    f.write_str(back)?;
277                }
278            }
279        }
280
281        Ok(())
282    }
283
284    /// Resolves the path and writes the prefixes as much as confirmed.
285    fn resolve_and_write_path_prefixes<S, W, const QUEUE_SIZE: usize>(
286        &mut self,
287        only_a_slash_is_written: &mut Option<bool>,
288        may_have_not_yet_resolved_dot_segments: &mut bool,
289        authority_is_present: bool,
290        f: &mut W,
291        op: NormalizationOp,
292    ) -> Result<ControlFlow<()>, fmt::Error>
293    where
294        S: Spec,
295        W: fmt::Write,
296    {
297        assert!(!self.is_empty());
298        assert!(*may_have_not_yet_resolved_dot_segments);
299
300        // Skip the dot segments at the head.
301        {
302            let skipped_len = PathSegmentsIter::new(self)
303                .map_while(|seg| match seg.kind(self) {
304                    SegmentKind::Dot | SegmentKind::DotDot => {
305                        debug_assert!(
306                            seg.has_leading_slash,
307                            "dot segments without a leading slash have already been skipped"
308                        );
309                        Some(seg.name_range.end)
310                    }
311                    SegmentKind::Normal => None,
312                })
313                .last()
314                .unwrap_or(0);
315            self.remove_start(skipped_len);
316            if self.is_empty() {
317                // Finished with a dot segment.
318                // The last `/.` or `/..` should be replaced to `/`.
319                if !authority_is_present && (*only_a_slash_is_written == Some(true)) {
320                    // Insert a dot segment to break the prefix `//`.
321                    // Without this, the path starts with `//` and it may
322                    // be confused with the prefix of an authority.
323                    f.write_str(".//")?;
324                } else {
325                    f.write_char('/')?;
326                }
327                return Ok(ControlFlow::Break(()));
328            }
329        }
330
331        // Find decisive path segments from higher to lower.
332        let (segname_queue, first_segment_has_leading_slash, resolved_end): (
333            [Option<&'_ str>; QUEUE_SIZE],
334            bool,
335            usize,
336        ) = {
337            let mut segname_queue: [Option<&'_ str>; QUEUE_SIZE] = [Default::default(); QUEUE_SIZE];
338            let mut first_segment_has_leading_slash = false;
339            // The end byte position of the prefix that is being written in this
340            // function call. The part before this position is ignorable after
341            // the next iteration.
342            let mut resolved_end = 0;
343
344            let mut level: usize = 0;
345            for seg in PathSegmentsIter::new(self) {
346                let kind = seg.kind(self);
347                match kind {
348                    SegmentKind::Dot => {
349                        *may_have_not_yet_resolved_dot_segments = true;
350                    }
351                    SegmentKind::DotDot => {
352                        level = level.saturating_sub(1);
353                        *may_have_not_yet_resolved_dot_segments = true;
354                        if let Some(dest) = segname_queue.get_mut(level) {
355                            *dest = None;
356                        }
357                    }
358                    SegmentKind::Normal => {
359                        if let Some(dest) = segname_queue.get_mut(level) {
360                            *dest = Some(seg.segment(self));
361                            *may_have_not_yet_resolved_dot_segments = false;
362                            resolved_end = seg.name_range.end;
363                            if level == 0 {
364                                first_segment_has_leading_slash = seg.has_leading_slash;
365                            }
366                        }
367                        level += 1;
368                    }
369                }
370            }
371
372            (segname_queue, first_segment_has_leading_slash, resolved_end)
373        };
374
375        // At this point, `segname_queue` has the prefix of the resolved path.
376        for segname in segname_queue.iter().flatten() {
377            PathToNormalize::emit_segment::<S, _>(
378                f,
379                only_a_slash_is_written,
380                first_segment_has_leading_slash,
381                segname,
382                authority_is_present,
383                op,
384            )?;
385        }
386
387        // Trim the processed prefix.
388        self.remove_start(resolved_end);
389
390        Ok(ControlFlow::Continue(()))
391    }
392
393    /// Emits a non-dot segment and update the current state.
394    //
395    // `first_segment_has_leading_slash` can be any value if the segment is not the first one.
396    fn emit_segment<S: Spec, W: fmt::Write>(
397        f: &mut W,
398        only_a_slash_is_written: &mut Option<bool>,
399        first_segment_has_leading_slash: bool,
400        segname: &str,
401        authority_is_present: bool,
402        op: NormalizationOp,
403    ) -> fmt::Result {
404        // Omit the leading slash of the segment only if the segment is
405        // the first one and marked as not having a leading slash.
406        match *only_a_slash_is_written {
407            None => {
408                // First segment.
409                if first_segment_has_leading_slash {
410                    f.write_char('/')?;
411                }
412                *only_a_slash_is_written =
413                    Some(first_segment_has_leading_slash && segname.is_empty());
414            }
415            Some(only_a_slash) => {
416                if only_a_slash && !authority_is_present {
417                    // Apply serialization like WHATWG URL Standard.
418                    // This prevents `<scheme=foo>:<path=//bar>` from written as `foo://bar`,
419                    // which is interpreted as `<scheme=foo>://<authority=bar>`, which is
420                    // semantically different from the original IRI. Prepending `./`, the
421                    // serialization result would be `foo:/.//bar`, which is semantically
422                    // equivalent to the original.
423                    f.write_str("./")?;
424                    *only_a_slash_is_written = Some(false);
425                }
426                f.write_char('/')?;
427            }
428        }
429
430        // Write the segment name.
431        if op.mode.case_pct_normalization() {
432            write!(f, "{}", PctCaseNormalized::<S>::new(segname))
433        } else {
434            f.write_str(segname)
435        }
436    }
437
438    /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent.
439    ///
440    /// Returns `Ok(())` when normalizable, returns `Err(_)` if not.
441    pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> {
442        /// A sink to get the prefix of the input.
443        #[derive(Default)]
444        struct PrefixRetriever {
445            /// The buffer to remember the prefix of the input.
446            buf: [u8; 3],
447            /// The next write position in the buffer.
448            cursor: usize,
449        }
450        impl PrefixRetriever {
451            /// Returns the read prefix data.
452            #[inline]
453            #[must_use]
454            fn as_bytes(&self) -> &[u8] {
455                &self.buf[..self.cursor]
456            }
457        }
458        impl fmt::Write for PrefixRetriever {
459            fn write_str(&mut self, s: &str) -> fmt::Result {
460                if !s.is_empty() && (self.cursor >= self.buf.len()) {
461                    // Enough bytes are read.
462                    return Err(fmt::Error);
463                }
464                self.buf[self.cursor..]
465                    .iter_mut()
466                    .zip(s.bytes())
467                    .for_each(|(dest, src)| *dest = src);
468                self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len());
469                Ok(())
470            }
471        }
472
473        let mut prefix = PrefixRetriever::default();
474        // The failure of this write indicates more than 3 characters are read.
475        // This is safe to ignore since the check needs only 3 characters.
476        let _ = self.fmt_write_normalize::<UriSpec, _>(
477            &mut prefix,
478            NormalizationOp {
479                mode: NormalizationMode::None,
480            },
481            // Assume the authority is absent.
482            false,
483        );
484
485        if prefix.as_bytes() == b"/./" {
486            Err(Error::new())
487        } else {
488            Ok(())
489        }
490    }
491}
492
493/// Characteristic of a path.
494#[derive(Debug, Clone, Copy)]
495pub(crate) enum PathCharacteristic {
496    /// Absolute path, not special.
497    CommonAbsolute,
498    /// Absolute path, not special.
499    CommonRelative,
500    /// The first path segment of the relative path has one or more colon characters.
501    RelativeFirstSegmentHasColon,
502    /// The path starts with the double slash.
503    StartsWithDoubleSlash,
504}
505
506impl PathCharacteristic {
507    /// Returns true if the path is absolute.
508    #[inline]
509    #[must_use]
510    pub(crate) fn is_absolute(self) -> bool {
511        matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash)
512    }
513
514    /// Returns the characteristic of the path.
515    pub(crate) fn from_path_to_display<S: Spec>(
516        path: &PathToNormalize<'_>,
517        op: NormalizationOp,
518        authority_is_present: bool,
519    ) -> Self {
520        /// Dummy writer to get necessary values.
521        #[derive(Default, Clone, Copy)]
522        struct Writer {
523            /// Result.
524            result: Option<PathCharacteristic>,
525            /// Whether the normalized path is absolute.
526            is_absolute: Option<bool>,
527        }
528        impl fmt::Write for Writer {
529            fn write_str(&mut self, mut s: &str) -> fmt::Result {
530                if self.result.is_some() {
531                    // Nothing more to do.
532                    return Err(fmt::Error);
533                }
534                while !s.is_empty() {
535                    if self.is_absolute.is_none() {
536                        // The first input.
537                        match s.strip_prefix('/') {
538                            Some(rest) => {
539                                self.is_absolute = Some(true);
540                                s = rest;
541                            }
542                            None => {
543                                self.is_absolute = Some(false);
544                            }
545                        }
546                        continue;
547                    }
548                    if self.is_absolute == Some(true) {
549                        let result = if s.starts_with('/') {
550                            PathCharacteristic::StartsWithDoubleSlash
551                        } else {
552                            PathCharacteristic::CommonAbsolute
553                        };
554                        self.result = Some(result);
555                        return Err(fmt::Error);
556                    }
557                    // Processing the first segment of the relative path.
558                    match find_split_hole(s, b'/') {
559                        Some((first_seg, _rest)) => {
560                            let result = if first_seg.contains(':') {
561                                PathCharacteristic::RelativeFirstSegmentHasColon
562                            } else {
563                                PathCharacteristic::CommonRelative
564                            };
565                            self.result = Some(result);
566                            return Err(fmt::Error);
567                        }
568                        None => {
569                            // `s` might not be the complete first segment.
570                            if s.contains(':') {
571                                self.result =
572                                    Some(PathCharacteristic::RelativeFirstSegmentHasColon);
573                                return Err(fmt::Error);
574                            }
575                            break;
576                        }
577                    }
578                }
579                Ok(())
580            }
581        }
582
583        let mut writer = Writer::default();
584        match path.fmt_write_normalize::<S, _>(&mut writer, op, authority_is_present) {
585            // Empty path.
586            Ok(_) => PathCharacteristic::CommonRelative,
587            Err(_) => writer
588                .result
589                .expect("the formatting quits early by `Err` when the check is done"),
590        }
591    }
592}
593
594/// Path segment kind.
595#[derive(Debug, Clone, Copy, PartialEq, Eq)]
596enum SegmentKind {
597    /// `.` or the equivalents.
598    Dot,
599    /// `..` or the equivalents.
600    DotDot,
601    /// Other normal (not special) segments.
602    Normal,
603}
604
605impl SegmentKind {
606    /// Creates a new `SegmentKind` from the given segment name.
607    #[must_use]
608    fn from_segment(s: &str) -> Self {
609        if !(1..=6).contains(&s.len()) {
610            // The length of a dot segment can only be 1, 2, 3, 4, or 6.
611            return SegmentKind::Normal;
612        }
613        if !(s.starts_with('.') || s.starts_with('%')) {
614            return SegmentKind::Normal;
615        }
616        match s {
617            "." | "%2E" | "%2e" => SegmentKind::Dot,
618            ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E"
619            | "%2e%2e" => SegmentKind::DotDot,
620            _ => SegmentKind::Normal,
621        }
622    }
623}
624
625/// A segment with optional leading slash.
626#[derive(Debug, Clone)]
627struct PathSegment {
628    /// Presence of a leading slash.
629    has_leading_slash: bool,
630    /// Range of the segment name (without any slashes).
631    name_range: Range<usize>,
632}
633
634impl PathSegment {
635    /// Returns the segment without any slashes.
636    #[inline]
637    #[must_use]
638    fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str {
639        if let Some(seg_name) = path.front.get(self.name_range.clone()) {
640            return seg_name;
641        }
642        let front_len = path.front.len();
643        let back = path
644            .back
645            .expect("the segname range implies that the back buffer is filled");
646        let back_range = (self.name_range.start - front_len)..(self.name_range.end - front_len);
647        &back[back_range]
648    }
649
650    /// Returns the segment kind.
651    #[inline]
652    #[must_use]
653    fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind {
654        SegmentKind::from_segment(self.segment(path))
655    }
656}
657
658/// Iterator of path segments.
659struct PathSegmentsIter<'a> {
660    /// Path.
661    path: &'a PathToNormalize<'a>,
662    /// Current cursor position.
663    ///
664    /// This is the next scan start position. If the previous segment has a
665    /// trailing slash, the value will be the next byte position of the slash.
666    cursor: usize,
667}
668
669impl<'a> PathSegmentsIter<'a> {
670    /// Creates a new iterator of path segments.
671    #[inline]
672    #[must_use]
673    fn new(path: &'a PathToNormalize<'a>) -> Self {
674        Self { path, cursor: 0 }
675    }
676}
677
678impl Iterator for PathSegmentsIter<'_> {
679    type Item = PathSegment;
680
681    fn next(&mut self) -> Option<Self::Item> {
682        let cursor_byte = self.path.byte_at(self.cursor)?;
683        let has_leading_slash = cursor_byte == b'/';
684        let segname_start = if has_leading_slash {
685            // Skip the leading slash.
686            self.cursor + 1
687        } else {
688            self.cursor
689        };
690
691        // Find the trailing slash of the next segment.
692        let segname_end = self
693            .path
694            .find_next_slash_from(segname_start)
695            .unwrap_or_else(|| self.path.len());
696        self.cursor = segname_end;
697        Some(PathSegment {
698            has_leading_slash,
699            name_range: segname_start..segname_end,
700        })
701    }
702}