fork of https://github.com/sourcegraph/zoekt
0

Configure Feed

Select the types of activity you want to include in your feed.

1// Copyright 2016 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package index 16 17import ( 18 "bytes" 19 "fmt" 20 21 "github.com/sourcegraph/zoekt" 22) 23 24// candidateMatch is a candidate match for a substring. 25// 26// Note: a lot of these can be in memory, so think about fieldalignment when 27// modify the fields of this structure. 28type candidateMatch struct { 29 substrBytes []byte 30 substrLowered []byte 31 32 scoreWeight float64 33 34 file uint32 35 symbolIdx uint32 36 37 // Offsets are relative to the start of the filename or file contents. 38 runeOffset uint32 39 byteOffset uint32 40 byteMatchSz uint32 41 42 // bools at end for struct field alignment 43 caseSensitive bool 44 fileName bool 45 symbol bool 46} 47 48// Matches content against the substring, and populates byteMatchSz on success 49func (m *candidateMatch) matchContent(content []byte) bool { 50 if m.caseSensitive { 51 comp := bytes.Equal(m.substrBytes, content[m.byteOffset:m.byteOffset+uint32(len(m.substrBytes))]) 52 53 m.byteMatchSz = uint32(len(m.substrBytes)) 54 return comp 55 } else { 56 // It is tempting to try a simple ASCII based 57 // comparison if possible, but we need more 58 // information. Simple ASCII chars have unicode upper 59 // case variants (the ASCII 'k' has the Kelvin symbol 60 // as upper case variant). We can only degrade to 61 // ASCII if we are sure that both the corpus and the 62 // query is ASCII only 63 sz, ok := caseFoldingEqualsRunes(m.substrLowered, content[m.byteOffset:]) 64 m.byteMatchSz = uint32(sz) 65 return ok 66 } 67} 68 69// matchIterator is a docIterator that produces candidateMatches for a given document 70type matchIterator interface { 71 docIterator 72 73 candidates() []*candidateMatch 74 75 // updateStats is called twice. After matchtree construction and after 76 // searching is done. Implementations must take care to not report 77 // statistics twice. 78 updateStats(*zoekt.Stats) 79} 80 81// noMatchTree is both matchIterator and matchTree that matches nothing. 82type noMatchTree struct { 83 Why string 84 85 // Stats captures the work done to create the noMatchTree. 86 Stats zoekt.Stats 87} 88 89func (t *noMatchTree) String() string { 90 return fmt.Sprintf("not(%q)", t.Why) 91} 92 93func (t *noMatchTree) candidates() []*candidateMatch { 94 return nil 95} 96 97func (t *noMatchTree) nextDoc() uint32 { 98 return maxUInt32 99} 100 101func (t *noMatchTree) prepare(uint32) {} 102 103func (t *noMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState { 104 return matchesNone 105} 106 107func (t *noMatchTree) updateStats(s *zoekt.Stats) { 108 s.Add(t.Stats) 109 t.Stats = zoekt.Stats{} 110} 111 112func (m *candidateMatch) String() string { 113 return fmt.Sprintf("%d:%d", m.file, m.runeOffset) 114} 115 116type ngramDocIterator struct { 117 leftPad uint32 118 rightPad uint32 119 120 iter hitIterator 121 ends []uint32 122 123 // ngramLookups is how many lookups we did to create this iterator. 124 ngramLookups int 125 126 // mutable 127 fileIdx uint32 128 matchCount int 129} 130 131// nextFileIndex returns the smallest index j of ends such that 132// ends[j] > offset, assuming ends[f] <= offset. 133func nextFileIndex(offset, f uint32, ends []uint32) uint32 { 134 d := uint32(1) 135 for f < uint32(len(ends)) && ends[f] <= offset { 136 if f+d < uint32(len(ends)) && ends[f+d] <= offset { 137 f += d 138 d *= 2 139 } else if d > 1 { 140 d = d/4 + 1 141 } else { 142 f++ 143 } 144 } 145 return f 146} 147 148func (i *ngramDocIterator) nextDoc() uint32 { 149 i.fileIdx = nextFileIndex(i.iter.first(), i.fileIdx, i.ends) 150 if i.fileIdx >= uint32(len(i.ends)) { 151 return maxUInt32 152 } 153 return i.fileIdx 154} 155 156func (i *ngramDocIterator) String() string { 157 return fmt.Sprintf("ngram(L=%d,R=%d,%v)", i.leftPad, i.rightPad, i.iter) 158} 159 160func (i *ngramDocIterator) prepare(nextDoc uint32) { 161 var start uint32 162 if nextDoc > 0 { 163 start = i.ends[nextDoc-1] 164 } 165 if start > 0 { 166 i.iter.next(start + i.leftPad - 1) 167 } 168 i.fileIdx = nextDoc 169} 170 171func (i *ngramDocIterator) updateStats(s *zoekt.Stats) { 172 i.iter.updateStats(s) 173 s.NgramMatches += i.matchCount 174 s.NgramLookups += i.ngramLookups 175 i.matchCount = 0 176 i.ngramLookups = 0 177} 178 179func (i *ngramDocIterator) candidates() []*candidateMatch { 180 if i.fileIdx >= uint32(len(i.ends)) { 181 return nil 182 } 183 184 var fileStart uint32 185 if i.fileIdx > 0 { 186 fileStart = i.ends[i.fileIdx-1] 187 } 188 fileEnd := i.ends[i.fileIdx] 189 190 var candidates []*candidateMatch 191 for { 192 p1 := i.iter.first() 193 if p1 == maxUInt32 || p1 >= i.ends[i.fileIdx] { 194 break 195 } 196 i.iter.next(p1) 197 198 if p1 < i.leftPad+fileStart || p1+i.rightPad > fileEnd { 199 continue 200 } 201 202 candidates = append(candidates, &candidateMatch{ 203 file: uint32(i.fileIdx), 204 runeOffset: p1 - fileStart - i.leftPad, 205 }) 206 } 207 i.matchCount += len(candidates) 208 return candidates 209}