1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
use std::iter::FusedIterator;
use std::str::{from_utf8, from_utf8_unchecked};

/// An iterator over chunks of valid UTF-8 in a RawStr.
///
/// Each item is a `Utf8Chunk`: a piece of valid UTF-8, possibly followed by
/// the bytes of one broken character.
///
/// See [`RawStr::utf8_chunks`](struct.RawStr.html#method.utf8_chunks).
#[derive(Debug, Clone)]
pub struct Utf8ChunksIter<'a> {
	/// The bytes that have not been yielded yet.
	///
	/// Shrinks from the front every time a chunk is produced.
	pub(super) bytes: &'a [u8],
}

/// A chunk of valid UTF-8, possibly followed by a broken character encoding.
///
/// Both parts borrow from the bytes the chunk was taken from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
	/// A valid UTF-8 piece, at the start, end, or between broken chars.
	///
	/// Empty between adjacent broken chars.
	pub valid: &'a str,

	/// The bytes of a broken char.
	///
	/// Can only be empty in the last chunk.
	///
	/// Should be replaced by a single Unicode replacement character
	/// (U+FFFD), if not empty.
	pub broken: &'a [u8],
}

impl<'a> Iterator for Utf8ChunksIter<'a> {
	type Item = Utf8Chunk<'a>;

	/// Yields the next chunk, or `None` once all bytes have been consumed.
	///
	/// Every yielded chunk consumes at least one byte. Only the last chunk
	/// can have an empty `broken` part.
	fn next(&mut self) -> Option<Utf8Chunk<'a>> {
		if self.bytes.is_empty() {
			return None;
		}
		match from_utf8(self.bytes) {
			// Everything that is left is valid UTF-8: this is the last chunk.
			Ok(valid) => {
				// Leaves an empty slice, so the next call returns `None`.
				self.bytes = &self.bytes[valid.len()..];
				Some(Utf8Chunk {
					valid,
					broken: self.bytes,
				})
			}
			Err(e) => {
				let (valid, rest) = self.bytes.split_at(e.valid_up_to());
				// SAFETY: `from_utf8` reported that the first `valid_up_to()`
				// bytes of `self.bytes` are valid UTF-8.
				let valid = unsafe { from_utf8_unchecked(valid) };
				// `error_len()` is `None` when the input ends in the middle of
				// a character; the whole remainder is the broken part then.
				let broken_len = e.error_len().unwrap_or(rest.len());
				let (broken, rest) = rest.split_at(broken_len);
				self.bytes = rest;
				Some(Utf8Chunk { valid, broken })
			}
		}
	}

	/// At least one chunk follows while bytes remain. Every chunk consumes at
	/// least one byte, so there are never more chunks than remaining bytes.
	#[inline]
	fn size_hint(&self) -> (usize, Option<usize>) {
		let remaining = self.bytes.len();
		if remaining == 0 {
			(0, Some(0))
		} else {
			(1, Some(remaining))
		}
	}
}

// Once `bytes` is empty, `next` returns `None` without modifying it, so the
// iterator keeps returning `None` after it has been exhausted.
impl<'a> FusedIterator for Utf8ChunksIter<'a> {}