# This file is a part of Julia. License is MIT: https://julialang.org/license

"""
    StringIndexError(str, i)

An error occurred when trying to access `str` at index `i` that is not valid.
"""
struct StringIndexError <: Exception
    string::AbstractString
    index::Integer
end

# Throw a `StringIndexError` for string `s` and index `i` (the index is converted to `Int`).
# NOTE(review): `@noinline` presumably keeps the throwing path out of inlined callers.
@noinline string_index_err(s::AbstractString, i::Integer) =
    throw(StringIndexError(s, Int(i)))

# Print "StringIndexError: invalid index [i]". When the index lies within
# `firstindex(s):ncodeunits(s)`, also print the character index at or before it (`thisind`)
# and, if it is still within the string, the one following that (`nextind`).
function Base.showerror(io::IO, exc::StringIndexError)
    s = exc.string
    print(io, "StringIndexError: ", "invalid index [$(exc.index)]")
    if firstindex(s) <= exc.index <= ncodeunits(s)
        iprev = thisind(s, exc.index)
        inext = nextind(s, iprev)
        if inext <= ncodeunits(s)
            print(io, ", valid nearby indices [$iprev]=>'$(s[iprev])', [$inext]=>'$(s[inext])'")
        else
            print(io, ", valid nearby index [$iprev]=>'$(s[iprev])'")
        end
    end
end

# The two byte-vector element types: `Vector{UInt8}` and `Vector{Int8}`.
const ByteArray = Union{Vector{UInt8},Vector{Int8}}

# Return whether `lo ≤ b ≤ hi`. Both comparisons are always evaluated (`&`, not `&&`).
@inline between(b::T, lo::T, hi::T) where {T<:Integer} = (lo ≤ b) & (b ≤ hi)

## constructors and conversions ##

# String constructor docstring from boot.jl, workaround for #16730
# and the unavailability of @doc in boot.jl context.
"""
    String(v::AbstractVector{UInt8})

Create a new `String` object from a byte vector `v` containing UTF-8 encoded
characters. If `v` is `Vector{UInt8}` it will be truncated to zero length and
future modification of `v` cannot affect the contents of the resulting string.
To avoid truncation of `Vector{UInt8}` data, use `String(copy(v))`; for other
`AbstractVector` types, `String(v)` already makes a copy.

When possible, the memory of `v` will be used without copying when the `String`
object is created. This is guaranteed to be the case for byte vectors returned
by [`take!`](@ref) on a writable [`IOBuffer`](@ref) and by calls to
[`read(io, nb)`](@ref). This allows zero-copy conversion of I/O data to strings.
In other cases, `Vector{UInt8}` data may be copied, but `v` is truncated anyway
to guarantee consistent behavior.
"""
String(v::AbstractVector{UInt8}) = String(copyto!(StringVector(length(v)), v))
# `Vector{UInt8}` method: hands `v` directly to the runtime's `jl_array_to_string`.
String(v::Vector{UInt8}) = ccall(:jl_array_to_string, Ref{String}, (Any,), v)

"""
    unsafe_string(p::Ptr{UInt8}, [length::Integer])

Copy a string from the address of a C-style (NUL-terminated) string encoded as UTF-8.
(The pointer can be safely freed afterwards.) If `length` is specified
(the length of the data in bytes), the string does not have to be NUL-terminated.

This function is labeled "unsafe" because it will crash if `p` is not
a valid memory address to data of the requested length.
"""
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}}, len::Integer)
    # A NULL pointer is rejected with an `ArgumentError` before calling into the runtime.
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
    ccall(:jl_pchar_to_string, Ref{String}, (Ptr{UInt8}, Int), p, len)
end
function unsafe_string(p::Union{Ptr{UInt8},Ptr{Int8}})
    # Same NULL check as the two-argument method; no length is passed to the runtime here.
    p == C_NULL && throw(ArgumentError("cannot convert NULL to string"))
    ccall(:jl_cstr_to_string, Ref{String}, (Ptr{UInt8},), p)
end

# Allocate a `String` of `n` bytes through the runtime's `jl_alloc_string`.
# NOTE(review): the contents are presumably uninitialized; the callers in this file
# (`getindex` on ranges, `repeat`) write all bytes right after allocating. Confirm.
_string_n(n::Integer) = ccall(:jl_alloc_string, Ref{String}, (Csize_t,), n)

"""
    String(s::AbstractString)

Convert a string to a contiguous byte array representation encoded as UTF-8 bytes.
This representation is often appropriate for passing strings to C.
"""
String(s::AbstractString) = print_to_string(s)
# Build a `String` from a `Symbol` by reading the C string at the pointer that
# `unsafe_convert(Ptr{UInt8}, s)` yields for it.
@pure String(s::Symbol) = unsafe_string(unsafe_convert(Ptr{UInt8}, s))

# Obtain a `Vector{UInt8}` for `s` from the runtime's `jl_string_to_array`.
# NOTE(review): presumably the vector shares the string's memory rather than copying;
# confirm against the runtime before relying on it.
unsafe_wrap(::Type{Vector{UInt8}}, s::String) =
    ccall(:jl_string_to_array, Ref{Vector{UInt8}}, (Any,), s)

# Copy the code units of a `String` into a freshly allocated `Vector{UInt8}`.
Vector{UInt8}(s::CodeUnits{UInt8,String}) = copyto!(Vector{UInt8}(undef, length(s)), s)
Vector{UInt8}(s::String) = Vector{UInt8}(codeunits(s))
Array{UInt8}(s::String) = Vector{UInt8}(codeunits(s))

# Return the string stored in the `s` field of the code-unit wrapper.
String(s::CodeUnits{UInt8,String}) = s.s

## low-level functions ##

# Pointer to the first byte of `s`, and to the byte at 1-based code unit index `i`.
pointer(s::String) = unsafe_convert(Ptr{UInt8}, s)
pointer(s::String, i::Integer) = pointer(s) + Int(i)::Int - 1

# Number of code units (bytes) in `s`, taken from `Core.sizeof`.
@pure ncodeunits(s::String) = Core.sizeof(s)
# The code unit type of `String` is `UInt8`.
codeunit(s::String) = UInt8

# Load the byte at code unit index `i`. The bounds check is inside `@boundscheck`, so
# callers using `@inbounds` may elide it; `s` is GC-preserved while its memory is read.
@inline function codeunit(s::String, i::Integer)
    @boundscheck checkbounds(s, i)
    b = GC.@preserve s unsafe_load(pointer(s, i))
    return b
end

## comparison ##

# Call C `memcmp` on `a` and `b` for `len` bytes (`len` is converted with `% Csize_t`)
# and convert the `Cint` result with `% Int`.
_memcmp(a::Union{Ptr{UInt8},AbstractString}, b::Union{Ptr{UInt8},AbstractString}, len) =
    ccall(:memcmp, Cint, (Ptr{UInt8}, Ptr{UInt8}, Csize_t), a, b, len % Csize_t) % Int

# Three-way comparison: compare the bytes of the common prefix first; when they are
# equal, the result is the comparison of the two byte sizes.
function cmp(a::String, b::String)
    al, bl = sizeof(a), sizeof(b)
    c = _memcmp(a, b, min(al,bl))
    return c < 0 ? -1 : c > 0 ? +1 : cmp(al,bl)
end

# Equality: the same object is trivially equal; otherwise the sizes must match and the
# bytes must compare equal.
function ==(a::String, b::String)
    pointer_from_objref(a) == pointer_from_objref(b) && return true
    al = sizeof(a)
    return al == sizeof(b) && 0 == _memcmp(a, b, al)
end

# `typemin` for `String` (type or instance) is the empty string.
typemin(::Type{String}) = ""
typemin(::String) = typemin(String)

## thisind, nextind ##

@propagate_inbounds thisind(s::String, i::Int) = _thisind_str(s, i)

# s should be String or SubString{String}
#
# Return the index of the start of the character whose encoding covers byte `i`.
# `0` and `ncodeunits(s) + 1` are returned unchanged; other out-of-range `i` throws
# `BoundsError` (unless bounds checks are elided).
@inline function _thisind_str(s, i::Int)
    i == 0 && return 0
    n = ncodeunits(s)
    i == n + 1 && return i
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds b = codeunit(s, i)
    # Only a continuation byte (10xxxxxx) with at least one byte before it can belong to
    # an earlier character; anything else starts at `i` itself.
    (b & 0xc0 == 0x80) & (i-1 > 0) || return i
    @inbounds b = codeunit(s, i-1)
    # One byte back: any multi-byte lead byte (2-, 3- or 4-byte sequence) covers `i`.
    between(b, 0b11000000, 0b11110111) && return i-1
    (b & 0xc0 == 0x80) & (i-2 > 0) || return i
    @inbounds b = codeunit(s, i-2)
    # Two bytes back: only 3- or 4-byte lead bytes reach as far as `i`.
    between(b, 0b11100000, 0b11110111) && return i-2
    (b & 0xc0 == 0x80) & (i-3 > 0) || return i
    @inbounds b = codeunit(s, i-3)
    # Three bytes back: only a 4-byte lead byte reaches as far as `i`.
    between(b, 0b11110000, 0b11110111) && return i-3
    return i
end

@propagate_inbounds nextind(s::String, i::Int) = _nextind_str(s, i)

# s should be String or SubString{String}
#
# Return the index following the character that covers byte `i`. `i == 0` yields `1`;
# `i` outside `1:ncodeunits(s)` otherwise throws `BoundsError` (unless bounds checks
# are elided).
@inline function _nextind_str(s, i::Int)
    i == 0 && return 1
    n = ncodeunits(s)
    @boundscheck between(i, 1, n) || throw(BoundsError(s, i))
    @inbounds l = codeunit(s, i)
    # Bytes below 0x80 and bytes from 0xf8 up are treated as one-byte units.
    (l < 0x80) | (0xf8 ≤ l) && return i+1
    if l < 0xc0
        # `i` is on a continuation byte: locate the start of its character. If that start
        # lies before `i`, the answer is the index after that character; otherwise the
        # byte stands alone.
        i′ = @inbounds thisind(s, i)
        return i′ < i ? @inbounds(nextind(s, i′)) : i+1
    end
    # `l` is a lead byte: step over the continuation bytes it allows (1 for < 0xe0,
    # 2 for < 0xf0, else 3), stopping at the end of the string or at the first byte that
    # is not a continuation byte.
    # first continuation byte
    (i += 1) > n && return i
    @inbounds b = codeunit(s, i)
    b & 0xc0 ≠ 0x80 && return i
    ((i += 1) > n) | (l < 0xe0) && return i
    # second continuation byte
    @inbounds b = codeunit(s, i)
    b & 0xc0 ≠ 0x80 && return i
    ((i += 1) > n) | (l < 0xf0) && return i
    # third continuation byte
    @inbounds b = codeunit(s, i)
    ifelse(b & 0xc0 ≠ 0x80, i, i+1)
end

## checking UTF-8 & ASCII validity ##

# Classify the bytes of `s` with the runtime's `u8_isvalid`; the result codes are listed
# below the call.
byte_string_classify(s::Union{String,Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}}}) =
    ccall(:u8_isvalid, Int32, (Ptr{UInt8}, Int), s, sizeof(s))
    # 0: neither valid ASCII nor UTF-8
    # 1: valid ASCII
    # 2: valid UTF-8

# The data is valid for `String` when the classification is non-zero (ASCII or UTF-8).
isvalid(::Type{String}, s::Union{Vector{UInt8},FastContiguousSubArray{UInt8,1,Vector{UInt8}},String}) =
    byte_string_classify(s) ≠ 0

isvalid(s::String) = isvalid(String, s)

# A continuation byte has `10` as its top two bits.
is_valid_continuation(c) = c & 0xc0 == 0x80

## required core functionality ##

# Iteration protocol: return `(char, next_index)` or `nothing` when `i` is out of range.
@inline function iterate(s::String, i::Int=firstindex(s))
    # Single unsigned comparison: `i ≤ 0` wraps around to a huge value, so this rejects
    # both `i < 1` and `i > ncodeunits(s)`.
    (i % UInt) - 1 < ncodeunits(s) || return nothing
    b = @inbounds codeunit(s, i)
    # The first byte goes into the most significant byte of the `Char` bit pattern.
    u = UInt32(b) << 24
    # Bytes outside 0x80:0xf7 form a complete `Char` by themselves.
    between(b, 0x80, 0xf7) || return reinterpret(Char, u), i+1
    return iterate_continued(s, i, u)
end

# Slow path of `iterate`: `u` holds the first byte (in 0x80:0xf7) in its top byte.
# Up to three continuation bytes are appended in successively lower bytes of `u`.
function iterate_continued(s::String, i::Int, u::UInt32)
    # First byte is itself a continuation byte: return it as a one-byte `Char`.
    u < 0xc0000000 && (i += 1; @goto ret)
    n = ncodeunits(s)
    # first continuation byte
    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16
    # second continuation byte
    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8
    # third continuation byte
    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i)
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b); i += 1
@label ret
    return reinterpret(Char, u), i
end

# Character at index `i`. Bounds are checked by `codeunit` (unless the caller uses
# `@inbounds`); bytes outside 0x80:0xf7 are returned directly as one-byte `Char`s.
@propagate_inbounds function getindex(s::String, i::Int)
    b = codeunit(s, i)
    u = UInt32(b) << 24
    between(b, 0x80, 0xf7) || return reinterpret(Char, u)
    return getindex_continued(s, i, u)
end

# Slow path of `getindex`: decode the continuation bytes following the first byte in `u`.
# Throws `StringIndexError` when `i` points at a continuation byte that is not a valid
# character index.
function getindex_continued(s::String, i::Int, u::UInt32)
    if u < 0xc0000000
        # called from `getindex` which checks bounds
        @inbounds isvalid(s, i) && @goto ret
        string_index_err(s, i)
    end
    n = ncodeunits(s)

    (i += 1) > n && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 1
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 16

    ((i += 1) > n) | (u < 0xe0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 2
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b) << 8

    ((i += 1) > n) | (u < 0xf0000000) && @goto ret
    @inbounds b = codeunit(s, i) # cont byte 3
    b & 0xc0 == 0x80 || @goto ret
    u |= UInt32(b)
@label ret
    return reinterpret(Char, u)
end

# Ranges with other integer types are converted to `UnitRange{Int}` first.
getindex(s::String, r::UnitRange{<:Integer}) = s[Int(first(r)):Int(last(r))]

# Substring by range: returns a new `String` holding a copy of the selected bytes.
# An empty range yields `""`. Both endpoints must be valid character indices, otherwise
# `StringIndexError` is thrown (checks are inside `@boundscheck`).
@inline function getindex(s::String, r::UnitRange{Int})
    isempty(r) && return ""
    i, j = first(r), last(r)
    @boundscheck begin
        checkbounds(s, r)
        @inbounds isvalid(s, i) || string_index_err(s, i)
        @inbounds isvalid(s, j) || string_index_err(s, j)
    end
    # Extend `j` from the first byte of the last character to its last byte.
    j = nextind(s, j) - 1
    n = j - i + 1
    ss = _string_n(n)
    # Keep both strings rooted while copying through raw pointers.
    GC.@preserve s ss unsafe_copyto!(pointer(ss), pointer(s, i), n)
    return ss
end

# Number of characters: start from the byte count and let `length_continued` subtract
# the continuation bytes that follow lead bytes.
length(s::String) = length_continued(s, 1, ncodeunits(s), ncodeunits(s))

# Number of characters with indices in `i:j`. `i` must be in `1:ncodeunits(s)+1` and `j`
# in `0:ncodeunits(s)`, otherwise `BoundsError` is thrown (checks are inside
# `@boundscheck`).
@inline function length(s::String, i::Int, j::Int)
    @boundscheck begin
        0 < i ≤ ncodeunits(s)+1 || throw(BoundsError(s, i))
        0 ≤ j < ncodeunits(s)+1 || throw(BoundsError(s, j))
    end
    j < i && return 0
    # Realign `i` to the start of its character; `k` remembers the original index.
    @inbounds i, k = thisind(s, i), i
    # Initial count is the byte count of the realigned span, minus one when the original
    # index `k` was not itself a character start.
    c = j - i + (i == k)
    length_continued(s, i, j, c)
end

# Scan bytes `i:n` of `s` and return `c` reduced by one for every continuation byte that
# follows a lead byte (0xc0:0xf7) within the number of bytes that lead byte allows.
@inline function length_continued(s::String, i::Int, n::Int, c::Int)
    i < n || return c
    @inbounds b = codeunit(s, i)
    @inbounds while true
        # Advance until `b` is a lead byte; `i` is then already on the byte after it.
        while true
            (i += 1) ≤ n || return c
            0xc0 ≤ b ≤ 0xf7 && break
            b = codeunit(s, i)
        end
        l = b
        b = codeunit(s, i) # cont byte 1
        c -= (x = b & 0xc0 == 0x80)
        # Continue to a second byte only after a continuation byte and a 3+ byte lead.
        x & (l ≥ 0xe0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 2
        c -= (x = b & 0xc0 == 0x80)
        # Continue to a third byte only after a continuation byte and a 4-byte lead.
        x & (l ≥ 0xf0) || continue

        (i += 1) ≤ n || return c
        b = codeunit(s, i) # cont byte 3
        c -= (b & 0xc0 == 0x80)
    end
end

## overload methods for efficiency ##

# `i` is a valid index when it is in bounds and is the start of its own character.
isvalid(s::String, i::Int) = checkbounds(Bool, s, i) && thisind(s, i) == i

# Return `false` as soon as any byte is 0x80 or above; `true` otherwise (including for
# the empty string, where the loop body never runs).
function isascii(s::String)
    @inbounds for i = 1:ncodeunits(s)
        codeunit(s, i) >= 0x80 && return false
    end
    return true
end

"""
    repeat(c::AbstractChar, r::Integer) -> String

Repeat a character `r` times. This can equivalently be accomplished by calling [`c^r`](@ref :^(::Union{AbstractString, AbstractChar}, ::Integer)).

# Examples
```jldoctest
julia> repeat('A', 3)
"AAA"
```
"""
repeat(c::AbstractChar, r::Integer) = repeat(Char(c), r) # fallback
# `Char` method: returns `""` for `r == 0`, throws `ArgumentError` for negative `r`, and
# otherwise fills a newly allocated string with `r` copies of the encoded bytes of `c`.
function repeat(c::Char, r::Integer)
    r == 0 && return ""
    r < 0 && throw(ArgumentError("can't repeat a character $r times"))
    # Byte-swap the `Char` bits so that the first encoded byte is the lowest byte of `u`.
    u = bswap(reinterpret(UInt32, c))
    # Encoded length in bytes: 4 minus the number of zero high-order bytes (`| 0xff`
    # caps the leading-zero count so the result is at least 1).
    n = 4 - (leading_zeros(u | 0xff) >> 3)
    s = _string_n(n*r)
    p = pointer(s)
    # NOTE(review): the 16- and 32-bit stores below presumably rely on a little-endian
    # host so that the low byte of `u` is written first; confirm for other targets.
    GC.@preserve s if n == 1
        ccall(:memset, Ptr{Cvoid}, (Ptr{UInt8}, Cint, Csize_t), p, u % UInt8, r)
    elseif n == 2
        p16 = reinterpret(Ptr{UInt16}, p)
        for i = 1:r
            unsafe_store!(p16, u % UInt16, i)
        end
    elseif n == 3
        # Three bytes do not fit a native store width, so write byte by byte.
        b1 = (u >> 0) % UInt8
        b2 = (u >> 8) % UInt8
        b3 = (u >> 16) % UInt8
        for i = 0:r-1
            unsafe_store!(p, b1, 3i + 1)
            unsafe_store!(p, b2, 3i + 2)
            unsafe_store!(p, b3, 3i + 3)
        end
    elseif n == 4
        p32 = reinterpret(Ptr{UInt32}, p)
        for i = 1:r
            unsafe_store!(p32, u, i)
        end
    end
    return s
end