matchtree.go at 27effbadc957cb77a48098ce69455f4caeaaa7e2 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 27effbadc957cb77a48098ce69455f4caeaaa7e2 28 kB View raw
Keegan Carruthers-Smith matchtree: special case word search (#526) 3y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we can may find something
  33	// interesting.
  34	nextDoc() uint32
  35
  36	// clears any per-document state of the docIterator, and
  37	// prepares for evaluating the given doc. The argument is
  38	// strictly increasing over time.
  39	prepare(nextDoc uint32)
  40}
  41
  42const (
  43	costConst   = 0
  44	costMemory  = 1
  45	costContent = 2
  46	costRegexp  = 3
  47)
  48
  49const (
  50	costMin = costConst
  51	costMax = costRegexp
  52)
  53
  54// An expression tree coupled with matches. The matchtree has two
  55// functions:
  56//
  57// * it implements boolean combinations (and, or, not)
  58//
  59// * it implements shortcuts, where we skip documents (for example: if
  60// there are no trigram matches, we can be sure there are no substring
  61// matches). The matchtree iterates over the documents as they are
  62// ordered in the shard.
  63//
  64// The general process for a given (shard, query) is:
  65//
  66// - construct matchTree for the query
  67//
  68// - find all different leaf matchTrees (substring, regexp, etc.)
  69//
  70// in a loop:
  71//
  72//   - find next doc to process using nextDoc
  73//
  74//   - evaluate atoms (leaf expressions that match text)
  75//
  76//   - evaluate the tree using matches(), storing the result in map.
  77//
  78//   - if the complete tree returns (matches() == true) for the document,
  79//     collect all text matches by looking at leaf matchTrees
  80type matchTree interface {
  81	docIterator
  82
  83	// returns whether this matches, and if we are sure.
  84	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
  85}
  86
  87// docMatchTree iterates over documents for which predicate(docID) returns true.
  88type docMatchTree struct {
  89	// the number of documents in a shard.
  90	numDocs uint32
  91
  92	predicate func(docID uint32) bool
  93
  94	// provides additional information about the reason why the docMatchTree was
  95	// created.
  96	reason string
  97
  98	// mutable
  99	firstDone bool
 100	docID     uint32
 101}
 102
 103type bruteForceMatchTree struct {
 104	// mutable
 105	firstDone bool
 106	docID     uint32
 107}
 108
 109type andLineMatchTree struct {
 110	andMatchTree
 111}
 112
 113type andMatchTree struct {
 114	children []matchTree
 115}
 116
 117type orMatchTree struct {
 118	children []matchTree
 119}
 120
 121type notMatchTree struct {
 122	child matchTree
 123}
 124
 125// Returns only the filename of child matches.
 126type fileNameMatchTree struct {
 127	child matchTree
 128}
 129
 130// Don't visit this subtree for collecting matches.
 131type noVisitMatchTree struct {
 132	matchTree
 133}
 134
 135type regexpMatchTree struct {
 136	regexp *regexp.Regexp
 137
 138	fileName bool
 139
 140	// mutable
 141	reEvaluated bool
 142	found       []*candidateMatch
 143
 144	// nextDoc, prepare.
 145	bruteForceMatchTree
 146}
 147
 148// \bLITERAL\b
 149type wordMatchTree struct {
 150	word string
 151
 152	fileName bool
 153
 154	// mutable
 155	evaluated bool
 156	found     []*candidateMatch
 157
 158	// nextDoc, prepare.
 159	bruteForceMatchTree
 160}
 161
 162type substrMatchTree struct {
 163	matchIterator
 164
 165	query         *query.Substring
 166	caseSensitive bool
 167	fileName      bool
 168
 169	// mutable
 170	current       []*candidateMatch
 171	contEvaluated bool
 172}
 173
 174type branchQueryMatchTree struct {
 175	fileMasks []uint64
 176	masks     []uint64
 177	repos     []uint16
 178
 179	// mutable
 180	firstDone bool
 181	docID     uint32
 182}
 183
 184type symbolRegexpMatchTree struct {
 185	matchTree
 186	regexp *regexp.Regexp
 187	all    bool // skips regex match if .*
 188
 189	reEvaluated bool
 190	found       []*candidateMatch
 191}
 192
 193func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 194	t.reEvaluated = false
 195}
 196
 197func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 198	if t.reEvaluated {
 199		return len(t.found) > 0, true
 200	}
 201
 202	if cost < costRegexp {
 203		return false, false
 204	}
 205
 206	sections := cp.docSections()
 207	content := cp.data(false)
 208
 209	found := t.found[:0]
 210	for i, sec := range sections {
 211		var idx []int
 212		if t.all {
 213			idx = []int{0, int(sec.End - sec.Start)}
 214		} else {
 215			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 216			if idx == nil {
 217				continue
 218			}
 219		}
 220
 221		cm := &candidateMatch{
 222			byteOffset:  sec.Start + uint32(idx[0]),
 223			byteMatchSz: uint32(idx[1] - idx[0]),
 224			symbol:      true,
 225			symbolIdx:   uint32(i),
 226		}
 227		found = append(found, cm)
 228	}
 229	t.found = found
 230	t.reEvaluated = true
 231
 232	return len(t.found) > 0, true
 233}
 234
 235type symbolSubstrMatchTree struct {
 236	*substrMatchTree
 237
 238	patternSize   uint32
 239	fileEndRunes  []uint32
 240	fileEndSymbol []uint32
 241
 242	doc      uint32
 243	sections []DocumentSection
 244
 245	secID uint32
 246}
 247
 248func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 249	t.substrMatchTree.prepare(doc)
 250	t.doc = doc
 251
 252	var fileStart uint32
 253	if doc > 0 {
 254		fileStart = t.fileEndRunes[doc-1]
 255	}
 256
 257	var sections []DocumentSection
 258	if len(t.sections) > 0 {
 259		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 260		if most == uint32(len(t.sections)) {
 261			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 262		} else {
 263			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 264				t.secID++
 265			}
 266
 267			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 268			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 269				symbolEnd++
 270			}
 271
 272			sections = t.sections[t.secID:symbolEnd]
 273		}
 274	}
 275
 276	secIdx := 0
 277	trimmed := t.current[:0]
 278	for len(sections) > secIdx && len(t.current) > 0 {
 279		start := fileStart + t.current[0].runeOffset
 280		end := start + t.patternSize
 281		if start >= sections[secIdx].End {
 282			secIdx++
 283			continue
 284		}
 285
 286		if start < sections[secIdx].Start {
 287			t.current = t.current[1:]
 288			continue
 289		}
 290
 291		if end <= sections[secIdx].End {
 292			t.current[0].symbol = true
 293			t.current[0].symbolIdx = uint32(secIdx)
 294			trimmed = append(trimmed, t.current[0])
 295		}
 296
 297		t.current = t.current[1:]
 298	}
 299	t.current = trimmed
 300}
 301
 302// all prepare methods
 303
 304func (t *bruteForceMatchTree) prepare(doc uint32) {
 305	t.docID = doc
 306	t.firstDone = true
 307}
 308
 309func (t *docMatchTree) prepare(doc uint32) {
 310	t.docID = doc
 311	t.firstDone = true
 312}
 313
 314func (t *andMatchTree) prepare(doc uint32) {
 315	for _, c := range t.children {
 316		c.prepare(doc)
 317	}
 318}
 319
 320func (t *regexpMatchTree) prepare(doc uint32) {
 321	t.found = t.found[:0]
 322	t.reEvaluated = false
 323	t.bruteForceMatchTree.prepare(doc)
 324}
 325
 326func (t *wordMatchTree) prepare(doc uint32) {
 327	t.found = t.found[:0]
 328	t.evaluated = false
 329	t.bruteForceMatchTree.prepare(doc)
 330}
 331
 332func (t *orMatchTree) prepare(doc uint32) {
 333	for _, c := range t.children {
 334		c.prepare(doc)
 335	}
 336}
 337
 338func (t *notMatchTree) prepare(doc uint32) {
 339	t.child.prepare(doc)
 340}
 341
 342func (t *fileNameMatchTree) prepare(doc uint32) {
 343	t.child.prepare(doc)
 344}
 345
 346func (t *substrMatchTree) prepare(nextDoc uint32) {
 347	t.matchIterator.prepare(nextDoc)
 348	t.current = t.matchIterator.candidates()
 349	t.contEvaluated = false
 350}
 351
 352func (t *branchQueryMatchTree) prepare(doc uint32) {
 353	t.firstDone = true
 354	t.docID = doc
 355}
 356
 357// nextDoc
 358
 359func (t *docMatchTree) nextDoc() uint32 {
 360	var start uint32
 361	if t.firstDone {
 362		start = t.docID + 1
 363	}
 364	for i := start; i < t.numDocs; i++ {
 365		if t.predicate(i) {
 366			return i
 367		}
 368	}
 369	return maxUInt32
 370}
 371
 372func (t *bruteForceMatchTree) nextDoc() uint32 {
 373	if !t.firstDone {
 374		return 0
 375	}
 376	return t.docID + 1
 377}
 378
 379func (t *andMatchTree) nextDoc() uint32 {
 380	var max uint32
 381	for _, c := range t.children {
 382		m := c.nextDoc()
 383		if m > max {
 384			max = m
 385		}
 386	}
 387	return max
 388}
 389
 390func (t *orMatchTree) nextDoc() uint32 {
 391	min := uint32(maxUInt32)
 392	for _, c := range t.children {
 393		m := c.nextDoc()
 394		if m < min {
 395			min = m
 396		}
 397	}
 398	return min
 399}
 400
 401func (t *notMatchTree) nextDoc() uint32 {
 402	return 0
 403}
 404
 405func (t *fileNameMatchTree) nextDoc() uint32 {
 406	return t.child.nextDoc()
 407}
 408
 409func (t *branchQueryMatchTree) nextDoc() uint32 {
 410	var start uint32
 411	if t.firstDone {
 412		start = t.docID + 1
 413	}
 414
 415	for i := start; i < uint32(len(t.fileMasks)); i++ {
 416		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 417			return i
 418		}
 419	}
 420	return maxUInt32
 421}
 422
 423// all String methods
 424
 425func (t *bruteForceMatchTree) String() string {
 426	return "all"
 427}
 428
 429func (t *docMatchTree) String() string {
 430	return fmt.Sprintf("doc(%s)", t.reason)
 431}
 432
 433func (t *andMatchTree) String() string {
 434	return fmt.Sprintf("and%v", t.children)
 435}
 436
 437func (t *regexpMatchTree) String() string {
 438	f := ""
 439	if t.fileName {
 440		f = "f"
 441	}
 442	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 443}
 444
 445func (t *wordMatchTree) String() string {
 446	f := ""
 447	if t.fileName {
 448		f = "f"
 449	}
 450	return fmt.Sprintf("%sword(%s)", f, t.word)
 451}
 452
 453func (t *orMatchTree) String() string {
 454	return fmt.Sprintf("or%v", t.children)
 455}
 456
 457func (t *notMatchTree) String() string {
 458	return fmt.Sprintf("not(%v)", t.child)
 459}
 460
 461func (t *noVisitMatchTree) String() string {
 462	return fmt.Sprintf("novisit(%v)", t.matchTree)
 463}
 464
 465func (t *fileNameMatchTree) String() string {
 466	return fmt.Sprintf("f(%v)", t.child)
 467}
 468
 469func (t *substrMatchTree) String() string {
 470	f := ""
 471	if t.fileName {
 472		f = "f"
 473	}
 474
 475	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 476}
 477
 478func (t *branchQueryMatchTree) String() string {
 479	return fmt.Sprintf("branch(%x)", t.masks)
 480}
 481
 482func (t *symbolSubstrMatchTree) String() string {
 483	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 484}
 485
 486func (t *symbolRegexpMatchTree) String() string {
 487	return fmt.Sprintf("symbol(%v)", t.matchTree)
 488}
 489
 490// visitMatches visits all atoms in matchTree. Note: This visits
 491// noVisitMatchTree. For collecting matches use visitMatches.
 492func visitMatchTree(t matchTree, f func(matchTree)) {
 493	switch s := t.(type) {
 494	case *andMatchTree:
 495		for _, ch := range s.children {
 496			visitMatchTree(ch, f)
 497		}
 498	case *orMatchTree:
 499		for _, ch := range s.children {
 500			visitMatchTree(ch, f)
 501		}
 502	case *andLineMatchTree:
 503		visitMatchTree(&s.andMatchTree, f)
 504	case *noVisitMatchTree:
 505		visitMatchTree(s.matchTree, f)
 506	case *notMatchTree:
 507		visitMatchTree(s.child, f)
 508	case *fileNameMatchTree:
 509		visitMatchTree(s.child, f)
 510	case *symbolSubstrMatchTree:
 511		visitMatchTree(s.substrMatchTree, f)
 512	case *symbolRegexpMatchTree:
 513		visitMatchTree(s.matchTree, f)
 514	default:
 515		f(t)
 516	}
 517}
 518
 519// visitMatches visits all atoms which can contribute matches. Note: This
 520// skips noVisitMatchTree.
 521func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 522	switch s := t.(type) {
 523	case *andMatchTree:
 524		for _, ch := range s.children {
 525			if known[ch] {
 526				visitMatches(ch, known, f)
 527			}
 528		}
 529	case *andLineMatchTree:
 530		visitMatches(&s.andMatchTree, known, f)
 531	case *orMatchTree:
 532		for _, ch := range s.children {
 533			if known[ch] {
 534				visitMatches(ch, known, f)
 535			}
 536		}
 537	case *symbolSubstrMatchTree:
 538		visitMatches(s.substrMatchTree, known, f)
 539	case *notMatchTree:
 540	case *noVisitMatchTree:
 541		// don't collect into negative trees.
 542	case *fileNameMatchTree:
 543		// We will just gather the filename if we do not visit this tree.
 544	default:
 545		f(s)
 546	}
 547}
 548
 549// all matches() methods.
 550
 551func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 552	return t.predicate(cp.idx), true
 553}
 554
 555func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 556	return true, true
 557}
 558
 559// andLineMatchTree is a performance optimization of andMatchTree. For content
 560// searches we don't want to run the regex engine if there is no line that
 561// contains matches from all terms.
 562func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 563	matches, sure := t.andMatchTree.matches(cp, cost, known)
 564	if !(sure && matches) {
 565		return matches, sure
 566	}
 567
 568	// find child with fewest candidates
 569	min := maxUInt32
 570	fewestChildren := 0
 571	for ix, child := range t.children {
 572		v, ok := child.(*substrMatchTree)
 573		// make sure we are running a content search and that all candidates are a
 574		// substrMatchTree
 575		if !ok || v.fileName {
 576			return matches, sure
 577		}
 578		if len(v.current) < min {
 579			min = len(v.current)
 580			fewestChildren = ix
 581		}
 582	}
 583
 584	type lineRange struct {
 585		start int
 586		end   int
 587	}
 588	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 589	prev := -1
 590	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 591		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 592		if line == prev {
 593			continue
 594		}
 595		prev = line
 596		lines = append(lines, lineRange{byteStart, byteEnd})
 597	}
 598
 599	// children keeps track of the children's candidates we have already seen.
 600	children := make([][]*candidateMatch, 0, len(t.children)-1)
 601	for j, child := range t.children {
 602		if j == fewestChildren {
 603			continue
 604		}
 605		children = append(children, child.(*substrMatchTree).current)
 606	}
 607
 608nextLine:
 609	for i := 0; i < len(lines); i++ {
 610		hits := 1
 611	nextChild:
 612		for j := range children {
 613		nextCandidate:
 614			for len(children[j]) > 0 {
 615				candidate := children[j][0]
 616				bo := int(cp.findOffset(false, candidate.runeOffset))
 617				if bo < lines[i].start {
 618					children[j] = children[j][1:]
 619					continue nextCandidate
 620				}
 621				if bo <= lines[i].end {
 622					hits++
 623					continue nextChild
 624				}
 625				// move the `lines` iterator forward until bo <= line.end
 626				for i < len(lines) && bo > lines[i].end {
 627					i++
 628				}
 629				i--
 630				continue nextLine
 631			}
 632		}
 633		// return early once we found any line that contains matches from all children
 634		if hits == len(t.children) {
 635			return matches, true
 636		}
 637	}
 638	return false, true
 639}
 640
 641func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 642	sure := true
 643
 644	for _, ch := range t.children {
 645		v, ok := evalMatchTree(cp, cost, known, ch)
 646		if ok && !v {
 647			return false, true
 648		}
 649		if !ok {
 650			sure = false
 651		}
 652	}
 653
 654	return true, sure
 655}
 656
 657func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 658	matches := false
 659	sure := true
 660	for _, ch := range t.children {
 661		v, ok := evalMatchTree(cp, cost, known, ch)
 662		if ok {
 663			// we could short-circuit, but we want to use
 664			// the other possibilities as a ranking
 665			// signal.
 666			matches = matches || v
 667		} else {
 668			sure = false
 669		}
 670	}
 671	return matches, sure
 672}
 673
 674func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 675	return t.fileMasks[t.docID]&t.masks[t.repos[t.docID]] != 0, true
 676}
 677
 678func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 679	if t.reEvaluated {
 680		return len(t.found) > 0, true
 681	}
 682
 683	if cost < costRegexp {
 684		return false, false
 685	}
 686
 687	cp.stats.RegexpsConsidered++
 688	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 689	found := t.found[:0]
 690	for _, idx := range idxs {
 691		cm := &candidateMatch{
 692			byteOffset:  uint32(idx[0]),
 693			byteMatchSz: uint32(idx[1] - idx[0]),
 694			fileName:    t.fileName,
 695		}
 696
 697		found = append(found, cm)
 698	}
 699	t.found = found
 700	t.reEvaluated = true
 701
 702	return len(t.found) > 0, true
 703}
 704
 705func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 706	if t.evaluated {
 707		return len(t.found) > 0, true
 708	}
 709
 710	if cost < costRegexp {
 711		return false, false
 712	}
 713
 714	data := cp.data(t.fileName)
 715	offset := 0
 716	found := t.found[:0]
 717	for {
 718		idx := bytes.Index(data[offset:], []byte(t.word))
 719		if idx < 0 {
 720			break
 721		}
 722
 723		relStartOffset := offset + idx
 724		relEndOffset := relStartOffset + len(t.word)
 725
 726		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 727		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 728		if startBoundary && endBoundary {
 729			found = append(found, &candidateMatch{
 730				byteOffset:  uint32(offset + idx),
 731				byteMatchSz: uint32(len(t.word)),
 732				fileName:    t.fileName,
 733			})
 734		}
 735		offset += idx + len(t.word)
 736	}
 737
 738	t.found = found
 739	t.evaluated = true
 740
 741	return len(t.found) > 0, true
 742}
 743
 744// breakMatchesOnNewlines returns matches resulting from breaking each element
 745// of cms on newlines within text.
 746func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 747	var lineCMs []*candidateMatch
 748	for _, cm := range cms {
 749		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 750	}
 751	return lineCMs
 752}
 753
 754// breakOnNewlines returns matches resulting from breaking cm on newlines
 755// within text.
 756func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 757	var cms []*candidateMatch
 758	addMe := &candidateMatch{}
 759	*addMe = *cm
 760	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 761		if text[i] == '\n' {
 762			addMe.byteMatchSz = i - addMe.byteOffset
 763			if addMe.byteMatchSz != 0 {
 764				cms = append(cms, addMe)
 765			}
 766
 767			addMe = &candidateMatch{}
 768			*addMe = *cm
 769			addMe.byteOffset = i + 1
 770		}
 771	}
 772	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 773	if addMe.byteMatchSz != 0 {
 774		cms = append(cms, addMe)
 775	}
 776	return cms
 777}
 778
 779func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 780	if v, ok := known[mt]; ok {
 781		return v, true
 782	}
 783
 784	v, ok := mt.matches(cp, cost, known)
 785	if ok {
 786		known[mt] = v
 787	}
 788
 789	return v, ok
 790}
 791
 792func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 793	v, ok := evalMatchTree(cp, cost, known, t.child)
 794	return !v, ok
 795}
 796
 797func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 798	return evalMatchTree(cp, cost, known, t.child)
 799}
 800
 801func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 802	if t.contEvaluated {
 803		return len(t.current) > 0, true
 804	}
 805
 806	if len(t.current) == 0 {
 807		return false, true
 808	}
 809
 810	if t.fileName && cost < costMemory {
 811		return false, false
 812	}
 813
 814	if !t.fileName && cost < costContent {
 815		return false, false
 816	}
 817
 818	pruned := t.current[:0]
 819	for _, m := range t.current {
 820		if m.byteOffset == 0 && m.runeOffset > 0 {
 821			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 822		}
 823		if m.matchContent(cp.data(m.fileName)) {
 824			pruned = append(pruned, m)
 825		}
 826	}
 827	t.current = pruned
 828	t.contEvaluated = true
 829
 830	return len(t.current) > 0, true
 831}
 832
 833func (d *indexData) newMatchTree(q query.Q) (matchTree, error) {
 834	if q == nil {
 835		return nil, fmt.Errorf("got nil (sub)query")
 836	}
 837	switch s := q.(type) {
 838	case *query.Regexp:
 839		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 840		// superset of the regexp. If the returned matchTree is equivalent to the
 841		// original regexp, it returns true. An equivalent matchTree has the same
 842		// behaviour as the original regexp and can be used instead.
 843		//
 844		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 845		if err != nil {
 846			return nil, err
 847		}
 848		// if the query can be used in place of the regexp
 849		// return the subtree
 850		if isEq {
 851			return subMT, nil
 852		}
 853
 854		var tr matchTree
 855		if wmt, ok := regexpToWordMatchTree(s); ok {
 856			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 857			// provide something faster.
 858			tr = wmt
 859		} else {
 860			prefix := ""
 861			if !s.CaseSensitive {
 862				prefix = "(?i)"
 863			}
 864
 865			tr = &regexpMatchTree{
 866				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 867				fileName: s.FileName,
 868			}
 869		}
 870
 871		return &andMatchTree{
 872			children: []matchTree{
 873				tr, &noVisitMatchTree{subMT},
 874			},
 875		}, nil
 876	case *query.And:
 877		var r []matchTree
 878		for _, ch := range s.Children {
 879			ct, err := d.newMatchTree(ch)
 880			if err != nil {
 881				return nil, err
 882			}
 883			r = append(r, ct)
 884		}
 885		return &andMatchTree{r}, nil
 886	case *query.Or:
 887		var r []matchTree
 888		for _, ch := range s.Children {
 889			ct, err := d.newMatchTree(ch)
 890			if err != nil {
 891				return nil, err
 892			}
 893			r = append(r, ct)
 894		}
 895		return &orMatchTree{r}, nil
 896	case *query.Not:
 897		ct, err := d.newMatchTree(s.Child)
 898		return &notMatchTree{
 899			child: ct,
 900		}, err
 901
 902	case *query.Type:
 903		if s.Type != query.TypeFileName {
 904			break
 905		}
 906
 907		ct, err := d.newMatchTree(s.Child)
 908		if err != nil {
 909			return nil, err
 910		}
 911
 912		return &fileNameMatchTree{
 913			child: ct,
 914		}, nil
 915
 916	case *query.Substring:
 917		return d.newSubstringMatchTree(s)
 918
 919	case *query.Branch:
 920		masks := make([]uint64, 0, len(d.repoMetaData))
 921		if s.Pattern == "HEAD" {
 922			for i := 0; i < len(d.repoMetaData); i++ {
 923				masks = append(masks, 1)
 924			}
 925		} else {
 926			for _, branchIDs := range d.branchIDs {
 927				mask := uint64(0)
 928				for nm, m := range branchIDs {
 929					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
 930						mask |= uint64(m)
 931					}
 932				}
 933				masks = append(masks, mask)
 934			}
 935
 936		}
 937		return &branchQueryMatchTree{
 938			masks:     masks,
 939			fileMasks: d.fileBranchMasks,
 940			repos:     d.repos,
 941		}, nil
 942	case *query.Const:
 943		if s.Value {
 944			return &bruteForceMatchTree{}, nil
 945		} else {
 946			return &noMatchTree{"const"}, nil
 947		}
 948	case *query.Language:
 949		code, ok := d.metaData.LanguageMap[s.Language]
 950		if !ok {
 951			return &noMatchTree{"lang"}, nil
 952		}
 953		return &docMatchTree{
 954			reason:  "language",
 955			numDocs: d.numDocs(),
 956			predicate: func(docID uint32) bool {
 957				return d.getLanguage(docID) == code
 958			},
 959		}, nil
 960
 961	case *query.Symbol:
 962		subMT, err := d.newMatchTree(s.Expr)
 963		if err != nil {
 964			return nil, err
 965		}
 966
 967		if substr, ok := subMT.(*substrMatchTree); ok {
 968			// We have a feature flag for lazy decoding. If runeDocSectionsRaw is
 969			// non-nil it means we need to lazily decode on request.
 970			sections := d.runeDocSections
 971			if sections == nil && d.runeDocSectionsRaw != nil {
 972				sections = unmarshalDocSections(d.runeDocSectionsRaw, nil)
 973			}
 974
 975			return &symbolSubstrMatchTree{
 976				substrMatchTree: substr,
 977				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
 978				fileEndRunes:    d.fileEndRunes,
 979				fileEndSymbol:   d.fileEndSymbol,
 980				sections:        sections,
 981			}, nil
 982		}
 983
 984		var regexp *regexp.Regexp
 985		visitMatchTree(subMT, func(mt matchTree) {
 986			if t, ok := mt.(*regexpMatchTree); ok {
 987				regexp = t.regexp
 988			}
 989		})
 990		if regexp == nil {
 991			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
 992		}
 993
 994		return &symbolRegexpMatchTree{
 995			regexp:    regexp,
 996			all:       regexp.String() == "(?i)(?-s:.)*",
 997			matchTree: subMT,
 998		}, nil
 999
1000	case *query.FileNameSet:
1001		return &docMatchTree{
1002			reason:  "FileNameSet",
1003			numDocs: d.numDocs(),
1004			predicate: func(docID uint32) bool {
1005				fileName := d.fileName(docID)
1006				_, ok := s.Set[string(fileName)]
1007				return ok
1008			},
1009		}, nil
1010
1011	case *query.BranchesRepos:
1012		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1013		for repoIdx := range d.repoMetaData {
1014			var mask uint64
1015			for _, br := range s.List {
1016				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1017					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1018				}
1019			}
1020			reposBranchesWant[repoIdx] = mask
1021		}
1022		return &docMatchTree{
1023			reason:  "BranchesRepos",
1024			numDocs: d.numDocs(),
1025			predicate: func(docID uint32) bool {
1026				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1027			},
1028		}, nil
1029
1030	case *query.RepoSet:
1031		reposWant := make([]bool, len(d.repoMetaData))
1032		for repoIdx, r := range d.repoMetaData {
1033			if _, ok := s.Set[r.Name]; ok {
1034				reposWant[repoIdx] = true
1035			}
1036		}
1037		return &docMatchTree{
1038			reason:  "RepoSet",
1039			numDocs: d.numDocs(),
1040			predicate: func(docID uint32) bool {
1041				return reposWant[d.repos[docID]]
1042			},
1043		}, nil
1044
1045	case *query.RepoIDs:
1046		reposWant := make([]bool, len(d.repoMetaData))
1047		for repoIdx, r := range d.repoMetaData {
1048			if s.Repos.Contains(r.ID) {
1049				reposWant[repoIdx] = true
1050			}
1051		}
1052		return &docMatchTree{
1053			reason:  "RepoIDs",
1054			numDocs: d.numDocs(),
1055			predicate: func(docID uint32) bool {
1056				return reposWant[d.repos[docID]]
1057			},
1058		}, nil
1059
1060	case *query.Repo:
1061		reposWant := make([]bool, len(d.repoMetaData))
1062		for repoIdx, r := range d.repoMetaData {
1063			if s.Regexp.MatchString(r.Name) {
1064				reposWant[repoIdx] = true
1065			}
1066		}
1067		return &docMatchTree{
1068			reason:  "Repo",
1069			numDocs: d.numDocs(),
1070			predicate: func(docID uint32) bool {
1071				return reposWant[d.repos[docID]]
1072			},
1073		}, nil
1074
1075	case *query.RepoRegexp:
1076		reposWant := make([]bool, len(d.repoMetaData))
1077		for repoIdx, r := range d.repoMetaData {
1078			if s.Regexp.MatchString(r.Name) {
1079				reposWant[repoIdx] = true
1080			}
1081		}
1082		return &docMatchTree{
1083			reason:  "RepoRegexp",
1084			numDocs: d.numDocs(),
1085			predicate: func(docID uint32) bool {
1086				return reposWant[d.repos[docID]]
1087			},
1088		}, nil
1089
1090	case query.RawConfig:
1091		return &docMatchTree{
1092			reason:  s.String(),
1093			numDocs: d.numDocs(),
1094			predicate: func(docID uint32) bool {
1095				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1096			},
1097		}, nil
1098	}
1099	log.Panicf("type %T", q)
1100	return nil, nil
1101}
1102
1103func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1104	st := &substrMatchTree{
1105		query:         s,
1106		caseSensitive: s.CaseSensitive,
1107		fileName:      s.FileName,
1108	}
1109
1110	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1111		prefix := ""
1112		if !s.CaseSensitive {
1113			prefix = "(?i)"
1114		}
1115		t := &regexpMatchTree{
1116			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1117			fileName: s.FileName,
1118		}
1119		return t, nil
1120	}
1121
1122	result, err := d.iterateNgrams(s)
1123	if err != nil {
1124		return nil, err
1125	}
1126	st.matchIterator = result
1127	return st, nil
1128}
1129
1130func regexpToWordMatchTree(q *query.Regexp) (_ *wordMatchTree, ok bool) {
1131	// Needs to be case sensitive
1132	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1133		return nil, false
1134	}
1135	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1136	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1137		return nil, false
1138	}
1139	sub := q.Regexp.Sub
1140	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1141		return nil, false
1142	}
1143
1144	return &wordMatchTree{
1145		word:     string(sub[1].Rune),
1146		fileName: q.FileName,
1147	}, true
1148}
1149
1150// pruneMatchTree removes impossible branches from the matchTree, as indicated
1151// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1152func pruneMatchTree(mt matchTree) (matchTree, error) {
1153	var err error
1154	switch mt := mt.(type) {
1155	// leaf nodes that we test with the filter:
1156	case *substrMatchTree:
1157		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1158			if _, ok := res.matchIterator.(*noMatchTree); ok {
1159				return nil, nil
1160			}
1161		}
1162	// recursive tree structures:
1163	case *andMatchTree:
1164		// Any branch of an and becoming impossible means the entire clause
1165		// is impossible. Otherwise, just handle rewrites.
1166		for i, child := range mt.children {
1167			newChild, err := pruneMatchTree(child)
1168			if err != nil {
1169				return nil, err
1170			}
1171			if newChild == nil {
1172				return nil, nil
1173			}
1174			mt.children[i] = newChild
1175		}
1176	case *orMatchTree:
1177		// *All* branches of an OR becoming impossible means the entire clause
1178		// is impossible. Otherwise, drop impossible subclauses and handle
1179		// rewrites, including simplifying to a singular resulting child branch.
1180		n := 0
1181		for _, child := range mt.children {
1182			newChild, err := pruneMatchTree(child)
1183			if err != nil {
1184				return nil, err
1185			}
1186			if newChild != nil {
1187				mt.children[n] = newChild
1188				n++
1189			}
1190		}
1191		mt.children = mt.children[:n]
1192		if len(mt.children) == 1 {
1193			return mt.children[0], nil
1194		} else if len(mt.children) == 0 {
1195			return nil, nil
1196		}
1197	case *noVisitMatchTree:
1198		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1199		if err != nil {
1200			return nil, err
1201		}
1202		if mt.matchTree == nil {
1203			return nil, nil
1204		}
1205	case *fileNameMatchTree:
1206		mt.child, err = pruneMatchTree(mt.child)
1207	case *andLineMatchTree:
1208		child, err := pruneMatchTree(&mt.andMatchTree)
1209		if err != nil {
1210			return nil, err
1211		}
1212		if child == nil {
1213			return nil, nil
1214		}
1215		if c, ok := child.(*andMatchTree); ok {
1216			mt.andMatchTree = *c
1217		} else {
1218			// the and was simplified to a single clause,
1219			// so the linematch portion is irrelevant.
1220			return mt, nil
1221		}
1222	case *notMatchTree:
1223		mt.child, err = pruneMatchTree(mt.child)
1224		if err != nil {
1225			return nil, err
1226		}
1227		if mt.child == nil {
1228			// not false => true
1229			return &bruteForceMatchTree{}, nil
1230		}
1231	// unhandled:
1232	case *docMatchTree:
1233	case *bruteForceMatchTree:
1234	case *regexpMatchTree:
1235	case *wordMatchTree:
1236	}
1237	return mt, err
1238}
Configure Feed

Configure Feed