utf8

package utf8

import "unicode/utf8"

utf8 包实现了支持 UTF-8 编码文本的函数和常量。它包含在 rune 和 UTF-8 字节序列之间进行转换的函数。参见 https://en.wikipedia.org/wiki/UTF-8

Index

Constants
func AppendRune(p []byte, r rune) []byte
func DecodeLastRune(p []byte) (r rune, size int)
func DecodeLastRuneInString(s string) (r rune, size int)
func DecodeRune(p []byte) (r rune, size int)
func DecodeRuneInString(s string) (r rune, size int)
func EncodeRune(p []byte, r rune) int
func FullRune(p []byte) bool
func FullRuneInString(s string) bool
func RuneCount(p []byte) int
func RuneCountInString(s string) (n int)
func RuneLen(r rune) int
func RuneStart(b byte) bool
func Valid(p []byte) bool
func ValidRune(r rune) bool
func ValidString(s string) bool

Examples

AppendRune
DecodeLastRune
DecodeLastRuneInString
DecodeRune
DecodeRuneInString
EncodeRune
EncodeRune (OutOfRange)
FullRune
FullRuneInString
RuneCount
RuneCountInString
RuneLen
RuneStart
Valid
ValidRune
ValidString

Constants

// Basic numeric constants of the UTF-8 encoding.
const (
	RuneError = '\uFFFD'     // The "error" rune, also known as the "Unicode replacement character".
	RuneSelf  = 0x80         // Characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // Maximum number of bytes of a UTF-8 encoded Unicode character.
)

编码的基本数值。

Functions

func AppendRune

func AppendRune(p []byte, r rune) []byte

AppendRune 将 r 的 UTF-8 编码追加到 p 的末尾并返回扩展后的缓冲区。如果 rune 超出范围，则追加 RuneError 的编码。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main appends the UTF-8 encoding of U+10000 first to a nil slice and then
// to a slice that already holds "init", printing each result as a string.
func main() {
	const r rune = 0x10000

	for _, prefix := range [][]byte{nil, []byte("init")} {
		extended := utf8.AppendRune(prefix, r)
		fmt.Println(string(extended))
	}
}

Output:

𐀀
init𐀀

func DecodeLastRune

func DecodeLastRune(p []byte) (r rune, size int)

DecodeLastRune 解包 p 中的最后一个 UTF-8 编码并返回该 rune 及其字节宽度。如果 p 为空，则返回 (RuneError, 0)。否则，如果编码无效，则返回 (RuneError, 1)。对于正确的非空 UTF-8，这两种结果都是不可能的。

如果编码不是正确的 UTF-8、编码的 rune 超出范围、或不是该值最短的 UTF-8 编码，则该编码无效。不执行其他验证。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main walks the byte slice from its end towards its start, decoding the
// last rune of the remaining prefix on each step and printing the rune
// together with its width in bytes.
func main() {
	b := []byte("Hello, 世界")

	// end marks the exclusive upper bound of the bytes not yet decoded.
	for end := len(b); end > 0; {
		r, size := utf8.DecodeLastRune(b[:end])
		fmt.Printf("%c %v\n", r, size)
		end -= size
	}
}

Output:

界 3
世 3
  1
, 1
o 1
l 1
l 1
e 1
H 1

func DecodeLastRuneInString

func DecodeLastRuneInString(s string) (r rune, size int)

DecodeLastRuneInString 类似于 DecodeLastRune，但其输入为字符串。如果 s 为空，则返回 (RuneError, 0)。否则，如果编码无效，则返回 (RuneError, 1)。对于正确的非空 UTF-8，这两种结果都是不可能的。

如果编码不是正确的 UTF-8、编码的 rune 超出范围、或不是该值最短的 UTF-8 编码，则该编码无效。不执行其他验证。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main repeatedly decodes the final rune of a string, prints the rune and
// its byte width, and trims that rune off until the string is empty.
func main() {
	remaining := "Hello, 世界"

	for remaining != "" {
		r, width := utf8.DecodeLastRuneInString(remaining)
		fmt.Printf("%c %v\n", r, width)

		cut := len(remaining) - width
		remaining = remaining[:cut]
	}
}

Output:

界 3
世 3
  1
, 1
o 1
l 1
l 1
e 1
H 1

func DecodeRune

func DecodeRune(p []byte) (r rune, size int)

DecodeRune 解包 p 中的第一个 UTF-8 编码并返回该 rune 及其字节宽度。如果 p 为空，则返回 (RuneError, 0)。否则，如果编码无效，则返回 (RuneError, 1)。对于正确的非空 UTF-8，这两种结果都是不可能的。

如果编码不是正确的 UTF-8、编码的 rune 超出范围、或不是该值最短的 UTF-8 编码，则该编码无效。不执行其他验证。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main decodes a byte slice rune by rune from the front, printing every
// rune with its width in bytes.
func main() {
	b := []byte("Hello, 世界")

	// off is the index of the first byte that has not been decoded yet.
	for off := 0; off < len(b); {
		r, size := utf8.DecodeRune(b[off:])
		fmt.Printf("%c %v\n", r, size)
		off += size
	}
}

Output:

H 1
e 1
l 1
l 1
o 1
, 1
  1
世 3
界 3

func DecodeRuneInString

func DecodeRuneInString(s string) (r rune, size int)

DecodeRuneInString 类似于 DecodeRune，但其输入为字符串。如果 s 为空，则返回 (RuneError, 0)。否则，如果编码无效，则返回 (RuneError, 1)。对于正确的非空 UTF-8，这两种结果都是不可能的。

如果编码不是正确的 UTF-8、编码的 rune 超出范围、或不是该值最短的 UTF-8 编码，则该编码无效。不执行其他验证。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main consumes a string from the front: it decodes the leading rune,
// prints the rune and its byte width, then drops it, until nothing is left.
func main() {
	rest := "Hello, 世界"

	for rest != "" {
		r, width := utf8.DecodeRuneInString(rest)
		fmt.Printf("%c %v\n", r, width)
		rest = rest[width:]
	}
}

Output:

H 1
e 1
l 1
l 1
o 1
, 1
  1
世 3
界 3

func EncodeRune

func EncodeRune(p []byte, r rune) int

EncodeRune 将 rune 的 UTF-8 编码写入 p（必须足够大）。如果 rune 超出范围，则写入 RuneError 的编码。返回写入的字节数。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main encodes the rune '世' into a three-byte buffer and prints the buffer
// followed by the number of bytes EncodeRune reported as written.
func main() {
	var storage [3]byte
	dst := storage[:]

	written := utf8.EncodeRune(dst, '世')

	fmt.Println(dst)
	fmt.Println(written)
}

Output:

[228 184 150]
3

Example (OutOfRange)

package main

import (
	"fmt"
	"unicode/utf8"
)

// main encodes two out-of-range runes and utf8.RuneError itself, printing
// for each one its index, the encoded bytes (as numbers and as text) and
// the number of bytes written.
func main() {
	inputs := [...]rune{
		-1,             // below zero, out of range
		0x110000,       // above 0x10FFFF, out of range
		utf8.RuneError, // the Unicode replacement character
	}

	for idx := 0; idx < len(inputs); idx++ {
		// A fresh zeroed buffer per rune, so earlier results cannot leak through.
		var enc [3]byte
		n := utf8.EncodeRune(enc[:], inputs[idx])
		// %[2]s re-reads the buffer argument as text; the final %d then
		// continues with the argument after it.
		fmt.Printf("%d: %d %[2]s %d\n", idx, enc[:], n)
	}
}

Output:

0: [239 191 189] � 3
1: [239 191 189] � 3
2: [239 191 189] � 3

func FullRune

func FullRune(p []byte) bool

FullRune 报告 p 中的字节是否以一个完整的 UTF-8 编码的 rune 开头。无效编码也被视为完整的 rune，因为它会被解码为宽度为 1 的错误 rune（RuneError）。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main checks FullRune against the three-byte encoding of '世' and against
// the same encoding with its last byte cut off.
func main() {
	encoded := []byte{0xE4, 0xB8, 0x96} // 世

	for _, n := range []int{len(encoded), 2} {
		fmt.Println(utf8.FullRune(encoded[:n]))
	}
}

Output:

true
false

func FullRuneInString

func FullRuneInString(s string) bool

FullRuneInString 类似于 FullRune，但其输入为字符串。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main checks FullRuneInString against the string "世" and against its
// first two bytes only.
func main() {
	str := "世"

	whole := utf8.FullRuneInString(str)
	truncated := utf8.FullRuneInString(str[:2])

	fmt.Println(whole)
	fmt.Println(truncated)
}

Output:

true
false

func RuneCount

func RuneCount(p []byte) int

RuneCount 返回 p 中的 rune 数量。错误和过短的编码被视为宽度为 1 字节的单个 rune。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the byte length of a slice next to the number of runes
// reported for it by RuneCount.
func main() {
	data := []byte("Hello, 世界")

	byteLen, runeLen := len(data), utf8.RuneCount(data)

	fmt.Println("bytes =", byteLen)
	fmt.Println("runes =", runeLen)
}

Output:

bytes = 13
runes = 9

func RuneCountInString

func RuneCountInString(s string) (n int)

RuneCountInString 类似于 RuneCount，但其输入为字符串。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the byte length of a string next to the number of runes
// reported for it by RuneCountInString.
func main() {
	text := "Hello, 世界"

	rows := []struct {
		label string
		count int
	}{
		{"bytes =", len(text)},
		{"runes =", utf8.RuneCountInString(text)},
	}
	for _, row := range rows {
		fmt.Println(row.label, row.count)
	}
}

Output:

bytes = 13
runes = 9

func RuneLen

func RuneLen(r rune) int

RuneLen 返回该 rune 的 UTF-8 编码的字节数。如果该 rune 不是可用 UTF-8 编码的有效值，则返回 -1。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the value RuneLen returns for the runes 'a' and '界'.
func main() {
	for _, r := range []rune{'a', '界'} {
		fmt.Println(utf8.RuneLen(r))
	}
}

Output:

1
3

func RuneStart

func RuneStart(b byte) bool

RuneStart 报告该字节是否可能是一个已编码（可能无效）rune 的第一个字节。第二个及后续字节的最高两位始终设置为 10。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the RuneStart result for each of the first three bytes of
// the encoded text "a界".
func main() {
	encoded := []byte("a界")

	for _, b := range encoded[:3] {
		fmt.Println(utf8.RuneStart(b))
	}
}

Output:

true
true
false

func Valid

func Valid(p []byte) bool

Valid 报告 p 是否完全由有效的 UTF-8 编码的 rune 组成。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the result of Valid for a slice built from a string literal
// and for the raw bytes 0xff, 0xfe, 0xfd.
func main() {
	samples := [][]byte{
		[]byte("Hello, 世界"),
		{0xff, 0xfe, 0xfd},
	}

	for _, sample := range samples {
		fmt.Println(utf8.Valid(sample))
	}
}

Output:

true
false

func ValidRune

func ValidRune(r rune) bool

ValidRune 报告 r 是否可以合法地编码为 UTF-8。超出范围的码点，或属于代理对一半（surrogate half）的码点，都是非法的。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the result of ValidRune for the rune 'a' and for the value
// 0xfffffff.
func main() {
	candidates := []rune{'a', rune(0xfffffff)}

	for _, r := range candidates {
		fmt.Println(utf8.ValidRune(r))
	}
}

Output:

true
false

func ValidString

func ValidString(s string) bool

ValidString 报告 s 是否完全由有效的 UTF-8 编码的 rune 组成。

Example

package main

import (
	"fmt"
	"unicode/utf8"
)

// main prints the result of ValidString for a string literal and for a
// string built from the raw bytes 0xff, 0xfe, 0xfd.
func main() {
	samples := []string{
		"Hello, 世界",
		string([]byte{0xff, 0xfe, 0xfd}),
	}

	for _, sample := range samples {
		fmt.Println(utf8.ValidString(sample))
	}
}

Output:

true
false