fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 "math" 21 22 "github.com/sourcegraph/zoekt" 23) 24 25// candidateMatch is a candidate match for a substring. 26// 27// Note: a lot of these can be in memory, so think about fieldalignment when 28// modify the fields of this structure. 29type candidateMatch struct { 30 substrBytes []byte 31 substrLowered []byte 32 33 scoreWeight float64 34 35 file uint32 36 symbolIdx uint32 37 38 // Offsets are relative to the start of the filename or file contents. 39 runeOffset uint32 40 byteOffset uint32 41 byteMatchSz uint32 42 43 // bools at end for struct field alignment 44 caseSensitive bool 45 fileName bool 46 symbol bool 47} 48 49// Matches content against the substring, and populates byteMatchSz on success 50func (m *candidateMatch) matchContent(content []byte) bool { 51 if m.caseSensitive { 52 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) 53 54 m.byteMatchSz = uint32(len(m.substrBytes)) 55 return comp 56 } else { 57 // It is tempting to try a simple ASCII based 58 // comparison if possible, but we need more 59 // information. Simple ASCII chars have unicode upper 60 // case variants (the ASCII 'k' has the Kelvin symbol 61 // as upper case variant). We can only degrade to 62 // ASCII if we are sure that both the corpus and the 63 // query is ASCII only 64 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:]) 65 m.byteMatchSz = uint32(sz) 66 return ok 67 } 68} 69 70// matchIterator is a docIterator that produces candidateMatches for a given document 71type matchIterator interface { 72 docIterator 73 74 candidates() []*candidateMatch 75 76 // updateStats is called twice. After matchtree construction and after 77 // searching is done. Implementations must take care to not report 78 // statistics twice. 79 updateStats(*zoekt.Stats) 80} 81 82// noMatchTree is both matchIterator and matchTree that matches nothing. 83type noMatchTree struct { 84 Why string 85 86 // Stats captures the work done to create the noMatchTree. 87 Stats zoekt.Stats 88} 89 90func (t *noMatchTree) String() string { 91 return fmt.Sprintf("not(%q)", t.Why) 92} 93 94func (t *noMatchTree) candidates() []*candidateMatch { 95 return nil 96} 97 98func (t *noMatchTree) nextDoc() uint32 { 99 return math.MaxUint32 100} 101 102func (t *noMatchTree) prepare(uint32) {} 103 104func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 105 return matchesNone 106} 107 108func (t *noMatchTree) updateStats(s *zoekt.Stats) { 109 s.Add(t.Stats) 110 t.Stats = zoekt.Stats{} 111} 112 113func (m *candidateMatch) String() string { 114 return fmt.Sprintf("%d:%d", m.file, m.runeOffset) 115} 116 117type ngramDocIterator struct { 118 leftPad uint32 119 rightPad uint32 120 121 iter hitIterator 122 ends []uint32 123 124 // ngramLookups is how many lookups we did to create this iterator. 125 ngramLookups int 126 127 // mutable 128 fileIdx uint32 129 matchCount int 130} 131 132// nextFileIndex returns the smallest index j of ends such that 133// ends[j] > offset, assuming ends[f] <= offset. 134func nextFileIndex(offset, f uint32, ends []uint32) uint32 { 135 d := uint32(1) 136 for f < uint32(len(ends)) && ends[f] <= offset { 137 if f+d < uint32(len(ends)) && ends[f+d] <= offset { 138 f += d 139 d *= 2 140 } else if d > 1 { 141 d = d/4 + 1 142 } else { 143 f++ 144 } 145 } 146 return f 147} 148 149func (i *ngramDocIterator) nextDoc() uint32 { 150 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends) 151 if i.fileIdx >= uint32(len(i.ends)) { 152 return math.MaxUint32 153 } 154 return i.fileIdx 155} 156 157func (i *ngramDocIterator) String() string { 158 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter) 159} 160 161func (i *ngramDocIterator) prepare(nextDoc uint32) { 162 var start uint32 163 if nextDoc > 0 { 164 start = i.ends[nextDoc-1] 165 } 166 if start > 0 { 167 i.iter.next(start + i.leftPad - 1) 168 } 169 i.fileIdx = nextDoc 170} 171 172func (i *ngramDocIterator) updateStats(s *zoekt.Stats) { 173 i.iter.updateStats(s) 174 s.NgramMatches += i.matchCount 175 s.NgramLookups += i.ngramLookups 176 i.matchCount = 0 177 i.ngramLookups = 0 178} 179 180func (i *ngramDocIterator) candidates() []*candidateMatch { 181 if i.fileIdx >= uint32(len(i.ends)) { 182 return nil 183 } 184 185 var fileStart uint32 186 if i.fileIdx > 0 { 187 fileStart = i.ends[i.fileIdx-1] 188 } 189 fileEnd := i.ends[i.fileIdx] 190 191 var candidates []*candidateMatch 192 for { 193 p1 := i.iter.first() 194 if p1 == math.MaxUint32 || p1 >= i.ends[i.fileIdx] { 195 break 196 } 197 i.iter.next(p1) 198 199 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd { 200 continue 201 } 202 203 candidates = append(candidates, &candidateMatch{ 204 file: uint32(i.fileIdx), 205 runeOffset: p1 - fileStart - i.leftPad, 206 }) 207 } 208 i.matchCount += len(candidates) 209 return candidates 210}