matchtree.go at 5e2620e0cf642a18f3f85da79122b7d1e20e6193 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 5e2620e0cf642a18f3f85da79122b7d1e20e6193 29 kB View raw
Keegan Carruthers-Smith matchtree: call prepare on symbolRegexpMatchTree subtree (#685) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we can may find something
  33	// interesting.
  34	nextDoc() uint32
  35
  36	// clears any per-document state of the docIterator, and
  37	// prepares for evaluating the given doc. The argument is
  38	// strictly increasing over time.
  39	prepare(nextDoc uint32)
  40}
  41
  42const (
  43	costConst   = 0
  44	costMemory  = 1
  45	costContent = 2
  46	costRegexp  = 3
  47)
  48
  49const (
  50	costMin = costConst
  51	costMax = costRegexp
  52)
  53
  54// An expression tree coupled with matches. The matchtree has two
  55// functions:
  56//
  57// * it implements boolean combinations (and, or, not)
  58//
  59// * it implements shortcuts, where we skip documents (for example: if
  60// there are no trigram matches, we can be sure there are no substring
  61// matches). The matchtree iterates over the documents as they are
  62// ordered in the shard.
  63//
  64// The general process for a given (shard, query) is:
  65//
  66// - construct matchTree for the query
  67//
  68// - find all different leaf matchTrees (substring, regexp, etc.)
  69//
  70// in a loop:
  71//
  72//   - find next doc to process using nextDoc
  73//
  74//   - evaluate atoms (leaf expressions that match text)
  75//
  76//   - evaluate the tree using matches(), storing the result in map.
  77//
  78//   - if the complete tree returns (matches() == true) for the document,
  79//     collect all text matches by looking at leaf matchTrees
  80type matchTree interface {
  81	docIterator
  82
  83	// returns whether this matches, and if we are sure.
  84	matches(cp *contentProvider, cost int, known map[matchTree]bool) (match bool, sure bool)
  85}
  86
  87// docMatchTree iterates over documents for which predicate(docID) returns true.
  88type docMatchTree struct {
  89	// the number of documents in a shard.
  90	numDocs uint32
  91
  92	predicate func(docID uint32) bool
  93
  94	// provides additional information about the reason why the docMatchTree was
  95	// created.
  96	reason string
  97
  98	// mutable
  99	firstDone bool
 100	docID     uint32
 101}
 102
 103type bruteForceMatchTree struct {
 104	// mutable
 105	firstDone bool
 106	docID     uint32
 107}
 108
 109type andLineMatchTree struct {
 110	andMatchTree
 111}
 112
 113type andMatchTree struct {
 114	children []matchTree
 115}
 116
 117type orMatchTree struct {
 118	children []matchTree
 119}
 120
 121type notMatchTree struct {
 122	child matchTree
 123}
 124
 125// Returns only the filename of child matches.
 126type fileNameMatchTree struct {
 127	child matchTree
 128}
 129
 130// Don't visit this subtree for collecting matches.
 131type noVisitMatchTree struct {
 132	matchTree
 133}
 134
 135type regexpMatchTree struct {
 136	regexp *regexp.Regexp
 137
 138	fileName bool
 139
 140	// mutable
 141	reEvaluated bool
 142	found       []*candidateMatch
 143
 144	// nextDoc, prepare.
 145	bruteForceMatchTree
 146}
 147
 148// \bLITERAL\b
 149type wordMatchTree struct {
 150	word string
 151
 152	fileName bool
 153
 154	// mutable
 155	evaluated bool
 156	found     []*candidateMatch
 157
 158	// nextDoc, prepare.
 159	bruteForceMatchTree
 160}
 161
 162type substrMatchTree struct {
 163	matchIterator
 164
 165	query         *query.Substring
 166	caseSensitive bool
 167	fileName      bool
 168
 169	// mutable
 170	current       []*candidateMatch
 171	contEvaluated bool
 172}
 173
 174type branchQueryMatchTree struct {
 175	fileMasks []uint64
 176	masks     []uint64
 177	repos     []uint16
 178
 179	// mutable
 180	firstDone bool
 181	docID     uint32
 182}
 183
 184func (t *branchQueryMatchTree) branchMask() uint64 {
 185	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 186}
 187
 188type symbolRegexpMatchTree struct {
 189	matchTree
 190	regexp *regexp.Regexp
 191	all    bool // skips regex match if .*
 192
 193	reEvaluated bool
 194	found       []*candidateMatch
 195}
 196
 197func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 198	t.reEvaluated = false
 199	t.found = t.found[:0]
 200	t.matchTree.prepare(doc)
 201}
 202
 203func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 204	if t.reEvaluated {
 205		return len(t.found) > 0, true
 206	}
 207
 208	if cost < costRegexp {
 209		return false, false
 210	}
 211
 212	sections := cp.docSections()
 213	content := cp.data(false)
 214
 215	found := t.found[:0]
 216	for i, sec := range sections {
 217		var idx []int
 218		if t.all {
 219			idx = []int{0, int(sec.End - sec.Start)}
 220		} else {
 221			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 222			if idx == nil {
 223				continue
 224			}
 225		}
 226
 227		cm := &candidateMatch{
 228			byteOffset:  sec.Start + uint32(idx[0]),
 229			byteMatchSz: uint32(idx[1] - idx[0]),
 230			symbol:      true,
 231			symbolIdx:   uint32(i),
 232		}
 233		found = append(found, cm)
 234	}
 235	t.found = found
 236	t.reEvaluated = true
 237
 238	return len(t.found) > 0, true
 239}
 240
 241type symbolSubstrMatchTree struct {
 242	*substrMatchTree
 243
 244	patternSize   uint32
 245	fileEndRunes  []uint32
 246	fileEndSymbol []uint32
 247
 248	doc      uint32
 249	sections []DocumentSection
 250
 251	secID uint32
 252}
 253
 254func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 255	t.substrMatchTree.prepare(doc)
 256	t.doc = doc
 257
 258	var fileStart uint32
 259	if doc > 0 {
 260		fileStart = t.fileEndRunes[doc-1]
 261	}
 262
 263	var sections []DocumentSection
 264	if len(t.sections) > 0 {
 265		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 266		if most == uint32(len(t.sections)) {
 267			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 268		} else {
 269			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 270				t.secID++
 271			}
 272
 273			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 274			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 275				symbolEnd++
 276			}
 277
 278			sections = t.sections[t.secID:symbolEnd]
 279		}
 280	}
 281
 282	secIdx := 0
 283	trimmed := t.current[:0]
 284	for len(sections) > secIdx && len(t.current) > 0 {
 285		start := fileStart + t.current[0].runeOffset
 286		end := start + t.patternSize
 287		if start >= sections[secIdx].End {
 288			secIdx++
 289			continue
 290		}
 291
 292		if start < sections[secIdx].Start {
 293			t.current = t.current[1:]
 294			continue
 295		}
 296
 297		if end <= sections[secIdx].End {
 298			t.current[0].symbol = true
 299			t.current[0].symbolIdx = uint32(secIdx)
 300			trimmed = append(trimmed, t.current[0])
 301		}
 302
 303		t.current = t.current[1:]
 304	}
 305	t.current = trimmed
 306}
 307
 308// all prepare methods
 309
 310func (t *bruteForceMatchTree) prepare(doc uint32) {
 311	t.docID = doc
 312	t.firstDone = true
 313}
 314
 315func (t *docMatchTree) prepare(doc uint32) {
 316	t.docID = doc
 317	t.firstDone = true
 318}
 319
 320func (t *andMatchTree) prepare(doc uint32) {
 321	for _, c := range t.children {
 322		c.prepare(doc)
 323	}
 324}
 325
 326func (t *regexpMatchTree) prepare(doc uint32) {
 327	t.found = t.found[:0]
 328	t.reEvaluated = false
 329	t.bruteForceMatchTree.prepare(doc)
 330}
 331
 332func (t *wordMatchTree) prepare(doc uint32) {
 333	t.found = t.found[:0]
 334	t.evaluated = false
 335	t.bruteForceMatchTree.prepare(doc)
 336}
 337
 338func (t *orMatchTree) prepare(doc uint32) {
 339	for _, c := range t.children {
 340		c.prepare(doc)
 341	}
 342}
 343
 344func (t *notMatchTree) prepare(doc uint32) {
 345	t.child.prepare(doc)
 346}
 347
 348func (t *fileNameMatchTree) prepare(doc uint32) {
 349	t.child.prepare(doc)
 350}
 351
 352func (t *substrMatchTree) prepare(nextDoc uint32) {
 353	t.matchIterator.prepare(nextDoc)
 354	t.current = t.matchIterator.candidates()
 355	t.contEvaluated = false
 356}
 357
 358func (t *branchQueryMatchTree) prepare(doc uint32) {
 359	t.firstDone = true
 360	t.docID = doc
 361}
 362
 363// nextDoc
 364
 365func (t *docMatchTree) nextDoc() uint32 {
 366	var start uint32
 367	if t.firstDone {
 368		start = t.docID + 1
 369	}
 370	for i := start; i < t.numDocs; i++ {
 371		if t.predicate(i) {
 372			return i
 373		}
 374	}
 375	return maxUInt32
 376}
 377
 378func (t *bruteForceMatchTree) nextDoc() uint32 {
 379	if !t.firstDone {
 380		return 0
 381	}
 382	return t.docID + 1
 383}
 384
 385func (t *andMatchTree) nextDoc() uint32 {
 386	var max uint32
 387	for _, c := range t.children {
 388		m := c.nextDoc()
 389		if m > max {
 390			max = m
 391		}
 392	}
 393	return max
 394}
 395
 396func (t *orMatchTree) nextDoc() uint32 {
 397	min := uint32(maxUInt32)
 398	for _, c := range t.children {
 399		m := c.nextDoc()
 400		if m < min {
 401			min = m
 402		}
 403	}
 404	return min
 405}
 406
 407func (t *notMatchTree) nextDoc() uint32 {
 408	return 0
 409}
 410
 411func (t *fileNameMatchTree) nextDoc() uint32 {
 412	return t.child.nextDoc()
 413}
 414
 415func (t *branchQueryMatchTree) nextDoc() uint32 {
 416	var start uint32
 417	if t.firstDone {
 418		start = t.docID + 1
 419	}
 420
 421	for i := start; i < uint32(len(t.fileMasks)); i++ {
 422		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 423			return i
 424		}
 425	}
 426	return maxUInt32
 427}
 428
 429// all String methods
 430
 431func (t *bruteForceMatchTree) String() string {
 432	return "all"
 433}
 434
 435func (t *docMatchTree) String() string {
 436	return fmt.Sprintf("doc(%s)", t.reason)
 437}
 438
 439func (t *andMatchTree) String() string {
 440	return fmt.Sprintf("and%v", t.children)
 441}
 442
 443func (t *regexpMatchTree) String() string {
 444	f := ""
 445	if t.fileName {
 446		f = "f"
 447	}
 448	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 449}
 450
 451func (t *wordMatchTree) String() string {
 452	f := ""
 453	if t.fileName {
 454		f = "f"
 455	}
 456	return fmt.Sprintf("%sword(%s)", f, t.word)
 457}
 458
 459func (t *orMatchTree) String() string {
 460	return fmt.Sprintf("or%v", t.children)
 461}
 462
 463func (t *notMatchTree) String() string {
 464	return fmt.Sprintf("not(%v)", t.child)
 465}
 466
 467func (t *noVisitMatchTree) String() string {
 468	return fmt.Sprintf("novisit(%v)", t.matchTree)
 469}
 470
 471func (t *fileNameMatchTree) String() string {
 472	return fmt.Sprintf("f(%v)", t.child)
 473}
 474
 475func (t *substrMatchTree) String() string {
 476	f := ""
 477	if t.fileName {
 478		f = "f"
 479	}
 480
 481	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 482}
 483
 484func (t *branchQueryMatchTree) String() string {
 485	return fmt.Sprintf("branch(%x)", t.masks)
 486}
 487
 488func (t *symbolSubstrMatchTree) String() string {
 489	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 490}
 491
 492func (t *symbolRegexpMatchTree) String() string {
 493	return fmt.Sprintf("symbol(%v)", t.matchTree)
 494}
 495
 496// visitMatches visits all atoms in matchTree. Note: This visits
 497// noVisitMatchTree. For collecting matches use visitMatches.
 498func visitMatchTree(t matchTree, f func(matchTree)) {
 499	switch s := t.(type) {
 500	case *andMatchTree:
 501		for _, ch := range s.children {
 502			visitMatchTree(ch, f)
 503		}
 504	case *orMatchTree:
 505		for _, ch := range s.children {
 506			visitMatchTree(ch, f)
 507		}
 508	case *andLineMatchTree:
 509		visitMatchTree(&s.andMatchTree, f)
 510	case *noVisitMatchTree:
 511		visitMatchTree(s.matchTree, f)
 512	case *notMatchTree:
 513		visitMatchTree(s.child, f)
 514	case *fileNameMatchTree:
 515		visitMatchTree(s.child, f)
 516	case *symbolSubstrMatchTree:
 517		visitMatchTree(s.substrMatchTree, f)
 518	case *symbolRegexpMatchTree:
 519		visitMatchTree(s.matchTree, f)
 520	default:
 521		f(t)
 522	}
 523}
 524
 525// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 526// function defined.
 527func updateMatchTreeStats(mt matchTree, stats *Stats) {
 528	visitMatchTree(mt, func(mt matchTree) {
 529		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 530			atom.updateStats(stats)
 531		}
 532	})
 533}
 534
 535// visitMatches visits all atoms which can contribute matches. Note: This
 536// skips noVisitMatchTree.
 537func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 538	switch s := t.(type) {
 539	case *andMatchTree:
 540		for _, ch := range s.children {
 541			if known[ch] {
 542				visitMatches(ch, known, f)
 543			}
 544		}
 545	case *andLineMatchTree:
 546		visitMatches(&s.andMatchTree, known, f)
 547	case *orMatchTree:
 548		for _, ch := range s.children {
 549			if known[ch] {
 550				visitMatches(ch, known, f)
 551			}
 552		}
 553	case *symbolSubstrMatchTree:
 554		visitMatches(s.substrMatchTree, known, f)
 555	case *notMatchTree:
 556	case *noVisitMatchTree:
 557		// don't collect into negative trees.
 558	case *fileNameMatchTree:
 559		// We will just gather the filename if we do not visit this tree.
 560	default:
 561		f(s)
 562	}
 563}
 564
 565// all matches() methods.
 566
 567func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 568	return t.predicate(cp.idx), true
 569}
 570
 571func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 572	return true, true
 573}
 574
 575// andLineMatchTree is a performance optimization of andMatchTree. For content
 576// searches we don't want to run the regex engine if there is no line that
 577// contains matches from all terms.
 578func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 579	matches, sure := t.andMatchTree.matches(cp, cost, known)
 580	if !(sure && matches) {
 581		return matches, sure
 582	}
 583
 584	// find child with fewest candidates
 585	min := maxUInt32
 586	fewestChildren := 0
 587	for ix, child := range t.children {
 588		v, ok := child.(*substrMatchTree)
 589		// make sure we are running a content search and that all candidates are a
 590		// substrMatchTree
 591		if !ok || v.fileName {
 592			return matches, sure
 593		}
 594		if len(v.current) < min {
 595			min = len(v.current)
 596			fewestChildren = ix
 597		}
 598	}
 599
 600	type lineRange struct {
 601		start int
 602		end   int
 603	}
 604	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 605	prev := -1
 606	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 607		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 608		if line == prev {
 609			continue
 610		}
 611		prev = line
 612		lines = append(lines, lineRange{byteStart, byteEnd})
 613	}
 614
 615	// children keeps track of the children's candidates we have already seen.
 616	children := make([][]*candidateMatch, 0, len(t.children)-1)
 617	for j, child := range t.children {
 618		if j == fewestChildren {
 619			continue
 620		}
 621		children = append(children, child.(*substrMatchTree).current)
 622	}
 623
 624nextLine:
 625	for i := 0; i < len(lines); i++ {
 626		hits := 1
 627	nextChild:
 628		for j := range children {
 629		nextCandidate:
 630			for len(children[j]) > 0 {
 631				candidate := children[j][0]
 632				bo := int(cp.findOffset(false, candidate.runeOffset))
 633				if bo < lines[i].start {
 634					children[j] = children[j][1:]
 635					continue nextCandidate
 636				}
 637				if bo <= lines[i].end {
 638					hits++
 639					continue nextChild
 640				}
 641				// move the `lines` iterator forward until bo <= line.end
 642				for i < len(lines) && bo > lines[i].end {
 643					i++
 644				}
 645				i--
 646				continue nextLine
 647			}
 648		}
 649		// return early once we found any line that contains matches from all children
 650		if hits == len(t.children) {
 651			return matches, true
 652		}
 653	}
 654	return false, true
 655}
 656
 657func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 658	sure := true
 659
 660	for _, ch := range t.children {
 661		v, ok := evalMatchTree(cp, cost, known, ch)
 662		if ok && !v {
 663			return false, true
 664		}
 665		if !ok {
 666			sure = false
 667		}
 668	}
 669
 670	return true, sure
 671}
 672
 673func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 674	matches := false
 675	sure := true
 676	for _, ch := range t.children {
 677		v, ok := evalMatchTree(cp, cost, known, ch)
 678		if ok {
 679			// we could short-circuit, but we want to use
 680			// the other possibilities as a ranking
 681			// signal.
 682			matches = matches || v
 683		} else {
 684			sure = false
 685		}
 686	}
 687	return matches, sure
 688}
 689
 690func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 691	return t.branchMask() != 0, true
 692}
 693
 694func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 695	if t.reEvaluated {
 696		return len(t.found) > 0, true
 697	}
 698
 699	if cost < costRegexp {
 700		return false, false
 701	}
 702
 703	cp.stats.RegexpsConsidered++
 704	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 705	found := t.found[:0]
 706	for _, idx := range idxs {
 707		cm := &candidateMatch{
 708			byteOffset:  uint32(idx[0]),
 709			byteMatchSz: uint32(idx[1] - idx[0]),
 710			fileName:    t.fileName,
 711		}
 712
 713		found = append(found, cm)
 714	}
 715	t.found = found
 716	t.reEvaluated = true
 717
 718	return len(t.found) > 0, true
 719}
 720
 721func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 722	if t.evaluated {
 723		return len(t.found) > 0, true
 724	}
 725
 726	if cost < costRegexp {
 727		return false, false
 728	}
 729
 730	data := cp.data(t.fileName)
 731	offset := 0
 732	found := t.found[:0]
 733	for {
 734		idx := bytes.Index(data[offset:], []byte(t.word))
 735		if idx < 0 {
 736			break
 737		}
 738
 739		relStartOffset := offset + idx
 740		relEndOffset := relStartOffset + len(t.word)
 741
 742		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 743		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 744		if startBoundary && endBoundary {
 745			found = append(found, &candidateMatch{
 746				byteOffset:  uint32(offset + idx),
 747				byteMatchSz: uint32(len(t.word)),
 748				fileName:    t.fileName,
 749			})
 750		}
 751		offset += idx + len(t.word)
 752	}
 753
 754	t.found = found
 755	t.evaluated = true
 756
 757	return len(t.found) > 0, true
 758}
 759
 760// breakMatchesOnNewlines returns matches resulting from breaking each element
 761// of cms on newlines within text.
 762func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 763	var lineCMs []*candidateMatch
 764	for _, cm := range cms {
 765		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 766	}
 767	return lineCMs
 768}
 769
 770// breakOnNewlines returns matches resulting from breaking cm on newlines
 771// within text.
 772func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 773	var cms []*candidateMatch
 774	addMe := &candidateMatch{}
 775	*addMe = *cm
 776	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 777		if text[i] == '\n' {
 778			addMe.byteMatchSz = i - addMe.byteOffset
 779			if addMe.byteMatchSz != 0 {
 780				cms = append(cms, addMe)
 781			}
 782
 783			addMe = &candidateMatch{}
 784			*addMe = *cm
 785			addMe.byteOffset = i + 1
 786		}
 787	}
 788	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 789	if addMe.byteMatchSz != 0 {
 790		cms = append(cms, addMe)
 791	}
 792	return cms
 793}
 794
 795func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) (bool, bool) {
 796	if v, ok := known[mt]; ok {
 797		return v, true
 798	}
 799
 800	v, ok := mt.matches(cp, cost, known)
 801	if ok {
 802		known[mt] = v
 803	}
 804
 805	return v, ok
 806}
 807
 808func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 809	v, ok := evalMatchTree(cp, cost, known, t.child)
 810	return !v, ok
 811}
 812
 813func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 814	return evalMatchTree(cp, cost, known, t.child)
 815}
 816
 817func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) (bool, bool) {
 818	if t.contEvaluated {
 819		return len(t.current) > 0, true
 820	}
 821
 822	if len(t.current) == 0 {
 823		return false, true
 824	}
 825
 826	if t.fileName && cost < costMemory {
 827		return false, false
 828	}
 829
 830	if !t.fileName && cost < costContent {
 831		return false, false
 832	}
 833
 834	pruned := t.current[:0]
 835	for _, m := range t.current {
 836		if m.byteOffset == 0 && m.runeOffset > 0 {
 837			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 838		}
 839		if m.matchContent(cp.data(m.fileName)) {
 840			pruned = append(pruned, m)
 841		}
 842	}
 843	t.current = pruned
 844	t.contEvaluated = true
 845
 846	return len(t.current) > 0, true
 847}
 848
 849type matchTreeOpt struct {
 850	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 851	// This was added since we do not support wordMatchTree with symbol search.
 852	DisableWordMatchOptimization bool
 853}
 854
 855func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 856	if q == nil {
 857		return nil, fmt.Errorf("got nil (sub)query")
 858	}
 859	switch s := q.(type) {
 860	case *query.Regexp:
 861		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 862		// superset of the regexp. If the returned matchTree is equivalent to the
 863		// original regexp, it returns true. An equivalent matchTree has the same
 864		// behaviour as the original regexp and can be used instead.
 865		//
 866		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 867		if err != nil {
 868			return nil, err
 869		}
 870		// if the query can be used in place of the regexp
 871		// return the subtree
 872		if isEq {
 873			return subMT, nil
 874		}
 875
 876		var tr matchTree
 877		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 878			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 879			// provide something faster.
 880			tr = wmt
 881		} else {
 882			prefix := ""
 883			if !s.CaseSensitive {
 884				prefix = "(?i)"
 885			}
 886
 887			tr = &regexpMatchTree{
 888				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 889				fileName: s.FileName,
 890			}
 891		}
 892
 893		return &andMatchTree{
 894			children: []matchTree{
 895				tr, &noVisitMatchTree{subMT},
 896			},
 897		}, nil
 898	case *query.And:
 899		var r []matchTree
 900		for _, ch := range s.Children {
 901			ct, err := d.newMatchTree(ch, opt)
 902			if err != nil {
 903				return nil, err
 904			}
 905			r = append(r, ct)
 906		}
 907		return &andMatchTree{r}, nil
 908	case *query.Or:
 909		var r []matchTree
 910		for _, ch := range s.Children {
 911			ct, err := d.newMatchTree(ch, opt)
 912			if err != nil {
 913				return nil, err
 914			}
 915			r = append(r, ct)
 916		}
 917		return &orMatchTree{r}, nil
 918	case *query.Not:
 919		ct, err := d.newMatchTree(s.Child, opt)
 920		return &notMatchTree{
 921			child: ct,
 922		}, err
 923
 924	case *query.Type:
 925		if s.Type != query.TypeFileName {
 926			break
 927		}
 928
 929		ct, err := d.newMatchTree(s.Child, opt)
 930		if err != nil {
 931			return nil, err
 932		}
 933
 934		return &fileNameMatchTree{
 935			child: ct,
 936		}, nil
 937
 938	case *query.Substring:
 939		return d.newSubstringMatchTree(s)
 940
 941	case *query.Branch:
 942		masks := make([]uint64, 0, len(d.repoMetaData))
 943		if s.Pattern == "HEAD" {
 944			for i := 0; i < len(d.repoMetaData); i++ {
 945				masks = append(masks, 1)
 946			}
 947		} else {
 948			for _, branchIDs := range d.branchIDs {
 949				mask := uint64(0)
 950				for nm, m := range branchIDs {
 951					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
 952						mask |= uint64(m)
 953					}
 954				}
 955				masks = append(masks, mask)
 956			}
 957
 958		}
 959		return &branchQueryMatchTree{
 960			masks:     masks,
 961			fileMasks: d.fileBranchMasks,
 962			repos:     d.repos,
 963		}, nil
 964	case *query.Const:
 965		if s.Value {
 966			return &bruteForceMatchTree{}, nil
 967		} else {
 968			return &noMatchTree{Why: "const"}, nil
 969		}
 970	case *query.Language:
 971		code, ok := d.metaData.LanguageMap[s.Language]
 972		if !ok {
 973			return &noMatchTree{Why: "lang"}, nil
 974		}
 975		return &docMatchTree{
 976			reason:  "language",
 977			numDocs: d.numDocs(),
 978			predicate: func(docID uint32) bool {
 979				return d.getLanguage(docID) == code
 980			},
 981		}, nil
 982
 983	case *query.Symbol:
 984		// Disable WordMatchTree since we don't support it in symbols yet.
 985		optCopy := opt
 986		optCopy.DisableWordMatchOptimization = true
 987
 988		subMT, err := d.newMatchTree(s.Expr, optCopy)
 989		if err != nil {
 990			return nil, err
 991		}
 992
 993		if substr, ok := subMT.(*substrMatchTree); ok {
 994			return &symbolSubstrMatchTree{
 995				substrMatchTree: substr,
 996				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
 997				fileEndRunes:    d.fileEndRunes,
 998				fileEndSymbol:   d.fileEndSymbol,
 999				sections:        d.runeDocSections,
1000			}, nil
1001		}
1002
1003		var regexp *regexp.Regexp
1004		visitMatchTree(subMT, func(mt matchTree) {
1005			if t, ok := mt.(*regexpMatchTree); ok {
1006				regexp = t.regexp
1007			}
1008		})
1009		if regexp == nil {
1010			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1011		}
1012
1013		return &symbolRegexpMatchTree{
1014			regexp:    regexp,
1015			all:       regexp.String() == "(?i)(?-s:.)*",
1016			matchTree: subMT,
1017		}, nil
1018
1019	case *query.FileNameSet:
1020		return &docMatchTree{
1021			reason:  "FileNameSet",
1022			numDocs: d.numDocs(),
1023			predicate: func(docID uint32) bool {
1024				fileName := d.fileName(docID)
1025				_, ok := s.Set[string(fileName)]
1026				return ok
1027			},
1028		}, nil
1029
1030	case *query.BranchesRepos:
1031		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1032		for repoIdx := range d.repoMetaData {
1033			var mask uint64
1034			for _, br := range s.List {
1035				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1036					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1037				}
1038			}
1039			reposBranchesWant[repoIdx] = mask
1040		}
1041		return &docMatchTree{
1042			reason:  "BranchesRepos",
1043			numDocs: d.numDocs(),
1044			predicate: func(docID uint32) bool {
1045				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1046			},
1047		}, nil
1048
1049	case *query.RepoSet:
1050		reposWant := make([]bool, len(d.repoMetaData))
1051		for repoIdx, r := range d.repoMetaData {
1052			if _, ok := s.Set[r.Name]; ok {
1053				reposWant[repoIdx] = true
1054			}
1055		}
1056		return &docMatchTree{
1057			reason:  "RepoSet",
1058			numDocs: d.numDocs(),
1059			predicate: func(docID uint32) bool {
1060				return reposWant[d.repos[docID]]
1061			},
1062		}, nil
1063
1064	case *query.RepoIDs:
1065		reposWant := make([]bool, len(d.repoMetaData))
1066		for repoIdx, r := range d.repoMetaData {
1067			if s.Repos.Contains(r.ID) {
1068				reposWant[repoIdx] = true
1069			}
1070		}
1071		return &docMatchTree{
1072			reason:  "RepoIDs",
1073			numDocs: d.numDocs(),
1074			predicate: func(docID uint32) bool {
1075				return reposWant[d.repos[docID]]
1076			},
1077		}, nil
1078
1079	case *query.Repo:
1080		reposWant := make([]bool, len(d.repoMetaData))
1081		for repoIdx, r := range d.repoMetaData {
1082			if s.Regexp.MatchString(r.Name) {
1083				reposWant[repoIdx] = true
1084			}
1085		}
1086		return &docMatchTree{
1087			reason:  "Repo",
1088			numDocs: d.numDocs(),
1089			predicate: func(docID uint32) bool {
1090				return reposWant[d.repos[docID]]
1091			},
1092		}, nil
1093
1094	case *query.RepoRegexp:
1095		reposWant := make([]bool, len(d.repoMetaData))
1096		for repoIdx, r := range d.repoMetaData {
1097			if s.Regexp.MatchString(r.Name) {
1098				reposWant[repoIdx] = true
1099			}
1100		}
1101		return &docMatchTree{
1102			reason:  "RepoRegexp",
1103			numDocs: d.numDocs(),
1104			predicate: func(docID uint32) bool {
1105				return reposWant[d.repos[docID]]
1106			},
1107		}, nil
1108
1109	case query.RawConfig:
1110		return &docMatchTree{
1111			reason:  s.String(),
1112			numDocs: d.numDocs(),
1113			predicate: func(docID uint32) bool {
1114				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1115			},
1116		}, nil
1117	}
1118	log.Panicf("type %T", q)
1119	return nil, nil
1120}
1121
1122func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1123	st := &substrMatchTree{
1124		query:         s,
1125		caseSensitive: s.CaseSensitive,
1126		fileName:      s.FileName,
1127	}
1128
1129	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1130		prefix := ""
1131		if !s.CaseSensitive {
1132			prefix = "(?i)"
1133		}
1134		t := &regexpMatchTree{
1135			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1136			fileName: s.FileName,
1137		}
1138		return t, nil
1139	}
1140
1141	result, err := d.iterateNgrams(s)
1142	if err != nil {
1143		return nil, err
1144	}
1145	st.matchIterator = result
1146	return st, nil
1147}
1148
1149func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1150	if opt.DisableWordMatchOptimization {
1151		return nil, false
1152	}
1153	// Needs to be case sensitive
1154	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1155		return nil, false
1156	}
1157	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1158	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1159		return nil, false
1160	}
1161	sub := q.Regexp.Sub
1162	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1163		return nil, false
1164	}
1165
1166	return &wordMatchTree{
1167		word:     string(sub[1].Rune),
1168		fileName: q.FileName,
1169	}, true
1170}
1171
1172// pruneMatchTree removes impossible branches from the matchTree, as indicated
1173// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1174func pruneMatchTree(mt matchTree) (matchTree, error) {
1175	var err error
1176	switch mt := mt.(type) {
1177	// leaf nodes that we test with the filter:
1178	case *substrMatchTree:
1179		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1180			if _, ok := res.matchIterator.(*noMatchTree); ok {
1181				return nil, nil
1182			}
1183		}
1184	// recursive tree structures:
1185	case *andMatchTree:
1186		// Any branch of an and becoming impossible means the entire clause
1187		// is impossible. Otherwise, just handle rewrites.
1188		for i, child := range mt.children {
1189			newChild, err := pruneMatchTree(child)
1190			if err != nil {
1191				return nil, err
1192			}
1193			if newChild == nil {
1194				return nil, nil
1195			}
1196			mt.children[i] = newChild
1197		}
1198	case *orMatchTree:
1199		// *All* branches of an OR becoming impossible means the entire clause
1200		// is impossible. Otherwise, drop impossible subclauses and handle
1201		// rewrites, including simplifying to a singular resulting child branch.
1202		n := 0
1203		for _, child := range mt.children {
1204			newChild, err := pruneMatchTree(child)
1205			if err != nil {
1206				return nil, err
1207			}
1208			if newChild != nil {
1209				mt.children[n] = newChild
1210				n++
1211			}
1212		}
1213		mt.children = mt.children[:n]
1214		if len(mt.children) == 1 {
1215			return mt.children[0], nil
1216		} else if len(mt.children) == 0 {
1217			return nil, nil
1218		}
1219	case *noVisitMatchTree:
1220		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1221		if err != nil {
1222			return nil, err
1223		}
1224		if mt.matchTree == nil {
1225			return nil, nil
1226		}
1227	case *fileNameMatchTree:
1228		mt.child, err = pruneMatchTree(mt.child)
1229	case *andLineMatchTree:
1230		child, err := pruneMatchTree(&mt.andMatchTree)
1231		if err != nil {
1232			return nil, err
1233		}
1234		if child == nil {
1235			return nil, nil
1236		}
1237		if c, ok := child.(*andMatchTree); ok {
1238			mt.andMatchTree = *c
1239		} else {
1240			// the and was simplified to a single clause,
1241			// so the linematch portion is irrelevant.
1242			return mt, nil
1243		}
1244	case *notMatchTree:
1245		mt.child, err = pruneMatchTree(mt.child)
1246		if err != nil {
1247			return nil, err
1248		}
1249		if mt.child == nil {
1250			// not false => true
1251			return &bruteForceMatchTree{}, nil
1252		}
1253	// unhandled:
1254	case *docMatchTree:
1255	case *bruteForceMatchTree:
1256	case *regexpMatchTree:
1257	case *wordMatchTree:
1258	}
1259	return mt, err
1260}
Configure Feed

Configure Feed