fork of https://github.com/sourcegraph/zoekt
1// Copyright 2021 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package zoekt
16
17import (
18 "fmt"
19 "math/rand"
20 "sort"
21 "testing"
22)
23
24func TestMakeArrayNgramOffset(t *testing.T) {
25 for n, tc := range []struct {
26 ngrams []string
27 offsets []uint32
28 }{
29 {nil, nil},
30 {[]string{"ant", "any", "awl", "big", "bin", "bit", "can", "con"}, []uint32{0, 2, 5, 8, 10, 14, 18, 25, 30}},
31 } {
32 ngrams := []ngram{}
33 for _, s := range tc.ngrams {
34 ngrams = append(ngrams, stringToNGram(s))
35 }
36 m := makeArrayNgramOffset(ngrams, tc.offsets)
37 t.Logf("makeNgramOffset(%v, %v) => %s", tc.ngrams, tc.offsets, &m)
38 failn := stringToNGram("foo")
39 if getFail := m.Get(failn); getFail != (simpleSection{}) {
40 t.Errorf("#%d: Get(%q) got %v, want zero", n, failn, getFail)
41 }
42 for i := 0; i < len(tc.offsets)-1; i++ {
43 want := simpleSection{tc.offsets[i], tc.offsets[i+1] - tc.offsets[i]}
44 got := m.Get(ngrams[i])
45 if want != got {
46 t.Errorf("#%d.%d: Get(%q) got %v, want %v", n, i, tc.ngrams[i], got, want)
47 }
48 failn := ngram(uint64(ngrams[i] - 1))
49 if getFail := m.Get(failn); getFail != (simpleSection{}) {
50 t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail)
51 }
52 failn = ngram(uint64(ngrams[i] + 1))
53 if getFail := m.Get(failn); getFail != (simpleSection{}) {
54 t.Errorf("#%d.%d: Get(%q) got %v, want zero", n, i, failn, getFail)
55 }
56 }
57 }
58}
59
60func TestMakeCombinedNgramOffset(t *testing.T) {
61 // The ascii / unicode ngram offset splitting is significantly
62 // more complicated. Exercise it with a more comprehensive test!
63 unicodeProbability := 0.2
64 ngramCount := 1000
65 ngramMap := map[ngram]bool{}
66
67 rng := rand.New(rand.NewSource(42))
68
69 randRune := func() rune {
70 if rng.Float64() < unicodeProbability {
71 return rune(0x100 + rand.Intn(0x80)) // Emoji
72 }
73 return rune('A' + rng.Intn('Z'-'A')) // A letter
74 }
75
76 for len(ngramMap) < ngramCount {
77 ngramMap[runesToNGram([3]rune{randRune(), randRune(), randRune()})] = true
78 }
79
80 ngrams := []ngram{}
81 for ng := range ngramMap {
82 ngrams = append(ngrams, ng)
83 }
84 sort.Slice(ngrams, func(i, j int) bool { return ngrams[i] < ngrams[j] })
85
86 offset := uint32(0)
87 offsets := []uint32{0}
88
89 for i := 0; i < len(ngrams); i++ {
90 // vary
91 offset += uint32(ngramAsciiMaxSectionLength/2 + rand.Intn(ngramAsciiMaxSectionLength))
92 offsets = append(offsets, offset)
93 }
94
95 m := makeCombinedNgramOffset(ngrams, offsets)
96
97 for i, ng := range ngrams {
98 want := simpleSection{offsets[i], offsets[i+1] - offsets[i]}
99 got := m.Get(ng)
100 if want != got {
101 t.Errorf("#%d: Get(%q) got %v, want %v", i, ng, got, want)
102 }
103 failn := ngram(uint64(ng - 1))
104 if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) {
105 t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail)
106 }
107 failn = ngram(uint64(ng + 1))
108 if getFail := m.Get(failn); !ngramMap[failn] && getFail != (simpleSection{}) {
109 t.Errorf("#%d: Get(%q) got %v, want zero", i, failn, getFail)
110 }
111 }
112
113 if t.Failed() || true {
114 t.Log(ngrams)
115 t.Log(offsets)
116 t.Log(m)
117 }
118}
119
120func (a combinedNgramOffset) String() string {
121 return fmt.Sprintf("combinedNgramOffset{\n asc: %s,\n uni: %s,\n}", a.asc, a.uni)
122}
123
124func (a *arrayNgramOffset) String() string {
125 o := "arrayNgramOffset{tops:{"
126 for i, p := range a.tops {
127 if i > 0 {
128 o += ", "
129 }
130 if p.top&1023 == 0 {
131 // only one rune is represented here
132 o += fmt.Sprintf("%s: %d", string(rune(p.top>>10)), p.off)
133 } else {
134 o += fmt.Sprintf("0x%x: %d", p.top>>10, p.off)
135 }
136 }
137 o += "}, bots: {"
138 for i, p := range a.bots {
139 if i > 0 {
140 o += ", "
141 }
142 if p < (256 << 21) {
143 // two ascii-ish runes (probably)
144 o += fmt.Sprintf("%s%s", string(rune(p>>21)), string(rune(p&runeMask)))
145 } else {
146 o += fmt.Sprintf("0x%x", p)
147 }
148 }
149 o += fmt.Sprintf("}, offsets: %v}", a.offsets)
150 return o
151}
152
153func (a *asciiNgramOffset) String() string {
154 o := "asciiNgramOffset{entries:{"
155 for i, e := range a.entries {
156 ng := ngramAsciiPackedToNgram(ngramAscii(uint32(e) >> 11))
157 length := e & ngramAsciiMaxSectionLength
158 if i > 0 {
159 o += ", "
160 }
161 o += fmt.Sprintf("%s: %d", ng, length)
162 }
163 o += fmt.Sprintf("}, chunkOffsets: %v}", a.chunkOffsets)
164 return o
165
166}