utf8.go 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. /*
  2. * Copyright 2022 ByteDance Inc.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package utf8
  17. import (
  18. `github.com/bytedance/sonic/internal/rt`
  19. `github.com/bytedance/sonic/internal/native/types`
  20. `github.com/bytedance/sonic/internal/native`
  21. )
  22. // CorrectWith corrects the invalid utf8 byte with repl string.
  23. func CorrectWith(dst []byte, src []byte, repl string) []byte {
  24. sstr := rt.Mem2Str(src)
  25. sidx := 0
  26. /* state machine records the invalid postions */
  27. m := types.NewStateMachine()
  28. m.Sp = 0 // invalid utf8 numbers
  29. for sidx < len(sstr) {
  30. scur := sidx
  31. ecode := native.ValidateUTF8(&sstr, &sidx, m)
  32. if m.Sp != 0 {
  33. if m.Sp > len(sstr) {
  34. panic("numbers of invalid utf8 exceed the string len!")
  35. }
  36. }
  37. for i := 0; i < m.Sp; i++ {
  38. ipos := m.Vt[i] // invalid utf8 position
  39. dst = append(dst, sstr[scur:ipos]...)
  40. dst = append(dst, repl...)
  41. scur = m.Vt[i] + 1
  42. }
  43. /* append the remained valid utf8 bytes */
  44. dst = append(dst, sstr[scur:sidx]...)
  45. /* not enough space, reset and continue */
  46. if ecode != 0 {
  47. m.Sp = 0
  48. }
  49. }
  50. types.FreeStateMachine(m)
  51. return dst
  52. }
  53. // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
  54. func Validate(src []byte) bool {
  55. return ValidateString(rt.Mem2Str(src))
  56. }
  57. // ValidateString as Validate, but for string.
  58. func ValidateString(src string) bool {
  59. return native.ValidateUTF8Fast(&src) == 0
  60. }