matchtree.go at 59ab9491e1337dda8f1123048dce0ecd2877fa8c · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 59ab9491e1337dda8f1123048dce0ecd2877fa8c 34 kB View raw
Keegan Carruthers-Smith all: use a faster vendored regexp/syntax/Regexp.String (#753) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  28	"github.com/sourcegraph/zoekt/query"
  29)
  30
  31// A docIterator iterates over documents in order.
  32type docIterator interface {
  33	// provide the next document where we may find something interesting.
  34	//
  35	// This is like a "peek" and shouldn't mutate state. prepare is what should
  36	// change state.
  37	nextDoc() uint32
  38
  39	// clears any per-document state of the docIterator, and
  40	// prepares for evaluating the given doc. The argument is
  41	// strictly increasing over time.
  42	prepare(nextDoc uint32)
  43}
  44
  45// costs are passed in increasing order to matchTree.matches until they do not
  46// return matchesRequiresHigherCost.
  47const (
  48	costConst   = 0
  49	costMemory  = 1
  50	costContent = 2
  51	costRegexp  = 3
  52)
  53
  54const (
  55	costMin = costConst
  56	costMax = costRegexp
  57)
  58
  59// matchesState is an enum for the state of a matchTree after a call to
  60// matchTree.matches.
  61type matchesState uint8
  62
  63const (
  64	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  65	// a search yet since the cost value is not high enough.
  66	matchesRequiresHigherCost matchesState = iota
  67
  68	// matchesFound is returned when matchTree.matches has done a search and
  69	// found one or more matches.
  70	matchesFound
  71
  72	// matchesNone is returned when matchTree.matches has done a search and
  73	// found nothing.
  74	matchesNone
  75)
  76
  77// matchesStatePred is a helper which returns matchesFound if b is true
  78// otherwise returns matchesNone.
  79func matchesStatePred(b bool) matchesState {
  80	if b {
  81		return matchesFound
  82	}
  83	return matchesNone
  84}
  85
  86// matchesStateForSlice is a helper which returns matchesFound if v is
  87// non-empty otherwise returns matchesNone.
  88func matchesStateForSlice[T any](v []T) matchesState {
  89	return matchesStatePred(len(v) > 0)
  90}
  91
  92// An expression tree coupled with matches. The matchtree has two
  93// functions:
  94//
  95// * it implements boolean combinations (and, or, not)
  96//
  97// * it implements shortcuts, where we skip documents (for example: if
  98// there are no trigram matches, we can be sure there are no substring
  99// matches). The matchtree iterates over the documents as they are
 100// ordered in the shard.
 101//
 102// The general process for a given (shard, query) is:
 103//
 104// - construct matchTree for the query
 105//
 106// - find all different leaf matchTrees (substring, regexp, etc.)
 107//
 108// in a loop:
 109//
 110//   - find next doc to process using nextDoc
 111//
 112//   - evaluate atoms (leaf expressions that match text)
 113//
 114//   - evaluate the tree using matches(), storing the result in map.
 115//
 116//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 117//     for the document, collect all text matches by looking at leaf
 118//     matchTrees.
 119type matchTree interface {
 120	docIterator
 121
 122	// matches if cost is high enough. See documentation for matchesState's
 123	// values.
 124	//
 125	// Note: Do not call this directly, rather use evalMatchTree which uses
 126	// known to cache responses once the state transitions away from
 127	// matchesRequiresHigherCost.
 128	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 129}
 130
 131// docMatchTree iterates over documents for which predicate(docID) returns true.
 132type docMatchTree struct {
 133	// the number of documents in a shard.
 134	numDocs uint32
 135
 136	predicate func(docID uint32) bool
 137
 138	// provides additional information about the reason why the docMatchTree was
 139	// created.
 140	reason string
 141
 142	// mutable
 143	firstDone bool
 144	docID     uint32
 145}
 146
 147type bruteForceMatchTree struct {
 148	// mutable
 149	firstDone bool
 150	docID     uint32
 151}
 152
 153type andLineMatchTree struct {
 154	andMatchTree
 155}
 156
 157type andMatchTree struct {
 158	children []matchTree
 159}
 160
 161type orMatchTree struct {
 162	children []matchTree
 163}
 164
 165type notMatchTree struct {
 166	child matchTree
 167}
 168
 169// Returns only the filename of child matches.
 170type fileNameMatchTree struct {
 171	child matchTree
 172}
 173
 174type boostMatchTree struct {
 175	child matchTree
 176	boost float64
 177}
 178
 179// Don't visit this subtree for collecting matches.
 180type noVisitMatchTree struct {
 181	matchTree
 182}
 183
 184type regexpMatchTree struct {
 185	regexp *regexp.Regexp
 186
 187	// origRegexp is the original parsed regexp from the query structure. It
 188	// does not include mutations such as case sensitivity.
 189	origRegexp *syntax.Regexp
 190
 191	fileName bool
 192
 193	// mutable
 194	reEvaluated bool
 195	found       []*candidateMatch
 196
 197	// nextDoc, prepare.
 198	bruteForceMatchTree
 199}
 200
 201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 202	prefix := ""
 203	if !s.CaseSensitive {
 204		prefix = "(?i)"
 205	}
 206
 207	return &regexpMatchTree{
 208		regexp:     regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
 209		origRegexp: s.Regexp,
 210		fileName:   s.FileName,
 211	}
 212}
 213
 214// \bLITERAL\b
 215type wordMatchTree struct {
 216	word string
 217
 218	fileName bool
 219
 220	// mutable
 221	evaluated bool
 222	found     []*candidateMatch
 223
 224	// nextDoc, prepare.
 225	bruteForceMatchTree
 226}
 227
 228type substrMatchTree struct {
 229	matchIterator
 230
 231	query         *query.Substring
 232	caseSensitive bool
 233	fileName      bool
 234
 235	// mutable
 236	current       []*candidateMatch
 237	contEvaluated bool
 238}
 239
 240type branchQueryMatchTree struct {
 241	fileMasks []uint64
 242	masks     []uint64
 243	repos     []uint16
 244
 245	// mutable
 246	firstDone bool
 247	docID     uint32
 248}
 249
 250func (t *branchQueryMatchTree) branchMask() uint64 {
 251	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 252}
 253
 254type symbolRegexpMatchTree struct {
 255	matchTree
 256	regexp *regexp.Regexp
 257	all    bool // skips regex match if .*
 258
 259	reEvaluated bool
 260	found       []*candidateMatch
 261}
 262
 263func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 264	t.reEvaluated = false
 265	t.found = t.found[:0]
 266	t.matchTree.prepare(doc)
 267}
 268
 269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 270	if t.reEvaluated {
 271		return matchesStateForSlice(t.found)
 272	}
 273
 274	if cost < costRegexp {
 275		return matchesRequiresHigherCost
 276	}
 277
 278	sections := cp.docSections()
 279	content := cp.data(false)
 280
 281	found := t.found[:0]
 282	for i, sec := range sections {
 283		var idx []int
 284		if t.all {
 285			idx = []int{0, int(sec.End - sec.Start)}
 286		} else {
 287			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 288			if idx == nil {
 289				continue
 290			}
 291		}
 292
 293		cm := &candidateMatch{
 294			byteOffset:  sec.Start + uint32(idx[0]),
 295			byteMatchSz: uint32(idx[1] - idx[0]),
 296			symbol:      true,
 297			symbolIdx:   uint32(i),
 298		}
 299		found = append(found, cm)
 300	}
 301	t.found = found
 302	t.reEvaluated = true
 303
 304	return matchesStateForSlice(t.found)
 305}
 306
 307type symbolSubstrMatchTree struct {
 308	*substrMatchTree
 309
 310	patternSize   uint32
 311	fileEndRunes  []uint32
 312	fileEndSymbol []uint32
 313
 314	doc      uint32
 315	sections []DocumentSection
 316
 317	secID uint32
 318}
 319
 320func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 321	t.substrMatchTree.prepare(doc)
 322	t.doc = doc
 323
 324	var fileStart uint32
 325	if doc > 0 {
 326		fileStart = t.fileEndRunes[doc-1]
 327	}
 328
 329	var sections []DocumentSection
 330	if len(t.sections) > 0 {
 331		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 332		if most == uint32(len(t.sections)) {
 333			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 334		} else {
 335			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 336				t.secID++
 337			}
 338
 339			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 340			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 341				symbolEnd++
 342			}
 343
 344			sections = t.sections[t.secID:symbolEnd]
 345		}
 346	}
 347
 348	secIdx := 0
 349	trimmed := t.current[:0]
 350	for len(sections) > secIdx && len(t.current) > 0 {
 351		start := fileStart + t.current[0].runeOffset
 352		end := start + t.patternSize
 353		if start >= sections[secIdx].End {
 354			secIdx++
 355			continue
 356		}
 357
 358		if start < sections[secIdx].Start {
 359			t.current = t.current[1:]
 360			continue
 361		}
 362
 363		if end <= sections[secIdx].End {
 364			t.current[0].symbol = true
 365			t.current[0].symbolIdx = uint32(secIdx)
 366			trimmed = append(trimmed, t.current[0])
 367		}
 368
 369		t.current = t.current[1:]
 370	}
 371	t.current = trimmed
 372}
 373
 374// all prepare methods
 375
 376func (t *bruteForceMatchTree) prepare(doc uint32) {
 377	t.docID = doc
 378	t.firstDone = true
 379}
 380
 381func (t *docMatchTree) prepare(doc uint32) {
 382	t.docID = doc
 383	t.firstDone = true
 384}
 385
 386func (t *andMatchTree) prepare(doc uint32) {
 387	for _, c := range t.children {
 388		c.prepare(doc)
 389	}
 390}
 391
 392func (t *regexpMatchTree) prepare(doc uint32) {
 393	t.found = t.found[:0]
 394	t.reEvaluated = false
 395	t.bruteForceMatchTree.prepare(doc)
 396}
 397
 398func (t *wordMatchTree) prepare(doc uint32) {
 399	t.found = t.found[:0]
 400	t.evaluated = false
 401	t.bruteForceMatchTree.prepare(doc)
 402}
 403
 404func (t *orMatchTree) prepare(doc uint32) {
 405	for _, c := range t.children {
 406		c.prepare(doc)
 407	}
 408}
 409
 410func (t *notMatchTree) prepare(doc uint32) {
 411	t.child.prepare(doc)
 412}
 413
 414func (t *fileNameMatchTree) prepare(doc uint32) {
 415	t.child.prepare(doc)
 416}
 417
 418func (t *boostMatchTree) prepare(doc uint32) {
 419	t.child.prepare(doc)
 420}
 421
 422func (t *substrMatchTree) prepare(nextDoc uint32) {
 423	t.matchIterator.prepare(nextDoc)
 424	t.current = t.matchIterator.candidates()
 425	t.contEvaluated = false
 426}
 427
 428func (t *branchQueryMatchTree) prepare(doc uint32) {
 429	t.firstDone = true
 430	t.docID = doc
 431}
 432
 433// nextDoc
 434
 435func (t *docMatchTree) nextDoc() uint32 {
 436	var start uint32
 437	if t.firstDone {
 438		start = t.docID + 1
 439	}
 440	for i := start; i < t.numDocs; i++ {
 441		if t.predicate(i) {
 442			return i
 443		}
 444	}
 445	return maxUInt32
 446}
 447
 448func (t *bruteForceMatchTree) nextDoc() uint32 {
 449	if !t.firstDone {
 450		return 0
 451	}
 452	return t.docID + 1
 453}
 454
 455func (t *andMatchTree) nextDoc() uint32 {
 456	var max uint32
 457	for _, c := range t.children {
 458		m := c.nextDoc()
 459		if m > max {
 460			max = m
 461		}
 462	}
 463	return max
 464}
 465
 466func (t *orMatchTree) nextDoc() uint32 {
 467	min := uint32(maxUInt32)
 468	for _, c := range t.children {
 469		m := c.nextDoc()
 470		if m < min {
 471			min = m
 472		}
 473	}
 474	return min
 475}
 476
 477func (t *notMatchTree) nextDoc() uint32 {
 478	return 0
 479}
 480
 481func (t *fileNameMatchTree) nextDoc() uint32 {
 482	return t.child.nextDoc()
 483}
 484
 485func (t *boostMatchTree) nextDoc() uint32 {
 486	return t.child.nextDoc()
 487}
 488
 489func (t *branchQueryMatchTree) nextDoc() uint32 {
 490	var start uint32
 491	if t.firstDone {
 492		start = t.docID + 1
 493	}
 494
 495	for i := start; i < uint32(len(t.fileMasks)); i++ {
 496		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 497			return i
 498		}
 499	}
 500	return maxUInt32
 501}
 502
 503// all String methods
 504
 505func (t *bruteForceMatchTree) String() string {
 506	return "all"
 507}
 508
 509func (t *docMatchTree) String() string {
 510	return fmt.Sprintf("doc(%s)", t.reason)
 511}
 512
 513func (t *andMatchTree) String() string {
 514	return fmt.Sprintf("and%v", t.children)
 515}
 516
 517func (t *regexpMatchTree) String() string {
 518	f := ""
 519	if t.fileName {
 520		f = "f"
 521	}
 522	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 523}
 524
 525func (t *wordMatchTree) String() string {
 526	f := ""
 527	if t.fileName {
 528		f = "f"
 529	}
 530	return fmt.Sprintf("%sword(%s)", f, t.word)
 531}
 532
 533func (t *orMatchTree) String() string {
 534	return fmt.Sprintf("or%v", t.children)
 535}
 536
 537func (t *notMatchTree) String() string {
 538	return fmt.Sprintf("not(%v)", t.child)
 539}
 540
 541func (t *noVisitMatchTree) String() string {
 542	return fmt.Sprintf("novisit(%v)", t.matchTree)
 543}
 544
 545func (t *fileNameMatchTree) String() string {
 546	return fmt.Sprintf("f(%v)", t.child)
 547}
 548
 549func (t *boostMatchTree) String() string {
 550	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 551}
 552
 553func (t *substrMatchTree) String() string {
 554	f := ""
 555	if t.fileName {
 556		f = "f"
 557	}
 558
 559	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 560}
 561
 562func (t *branchQueryMatchTree) String() string {
 563	return fmt.Sprintf("branch(%x)", t.masks)
 564}
 565
 566func (t *symbolSubstrMatchTree) String() string {
 567	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 568}
 569
 570func (t *symbolRegexpMatchTree) String() string {
 571	return fmt.Sprintf("symbol(%v)", t.matchTree)
 572}
 573
 574// visitMatches visits all atoms in matchTree. Note: This visits
 575// noVisitMatchTree. For collecting matches use visitMatches.
 576func visitMatchTree(t matchTree, f func(matchTree)) {
 577	switch s := t.(type) {
 578	case *andMatchTree:
 579		for _, ch := range s.children {
 580			visitMatchTree(ch, f)
 581		}
 582	case *orMatchTree:
 583		for _, ch := range s.children {
 584			visitMatchTree(ch, f)
 585		}
 586	case *andLineMatchTree:
 587		visitMatchTree(&s.andMatchTree, f)
 588	case *noVisitMatchTree:
 589		visitMatchTree(s.matchTree, f)
 590	case *notMatchTree:
 591		visitMatchTree(s.child, f)
 592	case *fileNameMatchTree:
 593		visitMatchTree(s.child, f)
 594	case *boostMatchTree:
 595		visitMatchTree(s.child, f)
 596	case *symbolSubstrMatchTree:
 597		visitMatchTree(s.substrMatchTree, f)
 598	case *symbolRegexpMatchTree:
 599		visitMatchTree(s.matchTree, f)
 600	default:
 601		f(t)
 602	}
 603}
 604
 605// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 606// function defined.
 607func updateMatchTreeStats(mt matchTree, stats *Stats) {
 608	visitMatchTree(mt, func(mt matchTree) {
 609		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 610			atom.updateStats(stats)
 611		}
 612	})
 613}
 614
 615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 616	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 617		f(mt)
 618	})
 619}
 620
 621// visitMatches visits all atoms which can contribute matches. Note: This
 622// skips noVisitMatchTree.
 623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 624	switch s := t.(type) {
 625	case *andMatchTree:
 626		for _, ch := range s.children {
 627			if known[ch] {
 628				visitMatches(ch, known, weight, f)
 629			}
 630		}
 631	case *andLineMatchTree:
 632		visitMatches(&s.andMatchTree, known, weight, f)
 633	case *orMatchTree:
 634		for _, ch := range s.children {
 635			if known[ch] {
 636				visitMatches(ch, known, weight, f)
 637			}
 638		}
 639	case *boostMatchTree:
 640		visitMatches(s.child, known, weight*s.boost, f)
 641	case *symbolSubstrMatchTree:
 642		visitMatches(s.substrMatchTree, known, weight, f)
 643	case *notMatchTree:
 644	case *noVisitMatchTree:
 645		// don't collect into negative trees.
 646	case *fileNameMatchTree:
 647		// We will just gather the filename if we do not visit this tree.
 648	default:
 649		f(s, weight)
 650	}
 651}
 652
 653// all matches() methods.
 654
 655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 656	return matchesStatePred(t.predicate(cp.idx))
 657}
 658
 659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 660	return matchesFound
 661}
 662
 663// andLineMatchTree is a performance optimization of andMatchTree. For content
 664// searches we don't want to run the regex engine if there is no line that
 665// contains matches from all terms.
 666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 667	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 668		return state
 669	}
 670
 671	// Invariant: all children have matches. If any line contains all of them we
 672	// can return MatchesFound.
 673
 674	// find child with fewest candidates
 675	min := maxUInt32
 676	fewestChildren := 0
 677	for ix, child := range t.children {
 678		v, ok := child.(*substrMatchTree)
 679		// make sure we are running a content search and that all candidates are a
 680		// substrMatchTree
 681		if !ok || v.fileName {
 682			return matchesFound
 683		}
 684		if len(v.current) < min {
 685			min = len(v.current)
 686			fewestChildren = ix
 687		}
 688	}
 689
 690	type lineRange struct {
 691		start int
 692		end   int
 693	}
 694	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 695	prev := -1
 696	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 697		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 698		if line == prev {
 699			continue
 700		}
 701		prev = line
 702		lines = append(lines, lineRange{byteStart, byteEnd})
 703	}
 704
 705	// children keeps track of the children's candidates we have already seen.
 706	children := make([][]*candidateMatch, 0, len(t.children)-1)
 707	for j, child := range t.children {
 708		if j == fewestChildren {
 709			continue
 710		}
 711		children = append(children, child.(*substrMatchTree).current)
 712	}
 713
 714nextLine:
 715	for i := 0; i < len(lines); i++ {
 716		hits := 1
 717	nextChild:
 718		for j := range children {
 719		nextCandidate:
 720			for len(children[j]) > 0 {
 721				candidate := children[j][0]
 722				bo := int(cp.findOffset(false, candidate.runeOffset))
 723				if bo < lines[i].start {
 724					children[j] = children[j][1:]
 725					continue nextCandidate
 726				}
 727				if bo <= lines[i].end {
 728					hits++
 729					continue nextChild
 730				}
 731				// move the `lines` iterator forward until bo <= line.end
 732				for i < len(lines) && bo > lines[i].end {
 733					i++
 734				}
 735				i--
 736				continue nextLine
 737			}
 738		}
 739		// return early once we found any line that contains matches from all children
 740		if hits == len(t.children) {
 741			return matchesFound
 742		}
 743	}
 744	return matchesNone
 745}
 746
 747func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 748	// We have found matches unless a child needs to do more work or it hasn't
 749	// found matches.
 750	state := matchesFound
 751
 752	for _, ch := range t.children {
 753		switch evalMatchTree(cp, cost, known, ch) {
 754		case matchesRequiresHigherCost:
 755			// keep evaluating other children incase we come across matchesNone
 756			state = matchesRequiresHigherCost
 757		case matchesFound:
 758			// will return this if every child has this value
 759		case matchesNone:
 760			return matchesNone
 761		}
 762	}
 763
 764	return state
 765}
 766
 767func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 768	// we could short-circuit, but we want to use the other possibilities as a
 769	// ranking signal. So we always return the most conservative state.
 770	state := matchesNone
 771	for _, ch := range t.children {
 772		switch evalMatchTree(cp, cost, known, ch) {
 773		case matchesRequiresHigherCost:
 774			state = matchesRequiresHigherCost
 775		case matchesFound:
 776			if state != matchesRequiresHigherCost {
 777				state = matchesFound
 778			}
 779		case matchesNone:
 780			// noop
 781		}
 782	}
 783	return state
 784}
 785
 786func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 787	return matchesStatePred(t.branchMask() != 0)
 788}
 789
 790func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 791	if t.reEvaluated {
 792		return matchesStateForSlice(t.found)
 793	}
 794
 795	if cost < costRegexp {
 796		return matchesRequiresHigherCost
 797	}
 798
 799	cp.stats.RegexpsConsidered++
 800	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 801	found := t.found[:0]
 802	for _, idx := range idxs {
 803		cm := &candidateMatch{
 804			byteOffset:  uint32(idx[0]),
 805			byteMatchSz: uint32(idx[1] - idx[0]),
 806			fileName:    t.fileName,
 807		}
 808
 809		found = append(found, cm)
 810	}
 811	t.found = found
 812	t.reEvaluated = true
 813
 814	return matchesStateForSlice(t.found)
 815}
 816
 817func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 818	if t.evaluated {
 819		return matchesStateForSlice(t.found)
 820	}
 821
 822	if cost < costRegexp {
 823		return matchesRequiresHigherCost
 824	}
 825
 826	data := cp.data(t.fileName)
 827	offset := 0
 828	found := t.found[:0]
 829	for {
 830		idx := bytes.Index(data[offset:], []byte(t.word))
 831		if idx < 0 {
 832			break
 833		}
 834
 835		relStartOffset := offset + idx
 836		relEndOffset := relStartOffset + len(t.word)
 837
 838		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 839		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 840		if startBoundary && endBoundary {
 841			found = append(found, &candidateMatch{
 842				byteOffset:  uint32(offset + idx),
 843				byteMatchSz: uint32(len(t.word)),
 844				fileName:    t.fileName,
 845			})
 846		}
 847		offset += idx + len(t.word)
 848	}
 849
 850	t.found = found
 851	t.evaluated = true
 852
 853	return matchesStateForSlice(t.found)
 854}
 855
 856// breakMatchesOnNewlines returns matches resulting from breaking each element
 857// of cms on newlines within text.
 858func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 859	var lineCMs []*candidateMatch
 860	for _, cm := range cms {
 861		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 862	}
 863	return lineCMs
 864}
 865
 866// breakOnNewlines returns matches resulting from breaking cm on newlines
 867// within text.
 868func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 869	var cms []*candidateMatch
 870	addMe := &candidateMatch{}
 871	*addMe = *cm
 872	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 873		if text[i] == '\n' {
 874			addMe.byteMatchSz = i - addMe.byteOffset
 875			if addMe.byteMatchSz != 0 {
 876				cms = append(cms, addMe)
 877			}
 878
 879			addMe = &candidateMatch{}
 880			*addMe = *cm
 881			addMe.byteOffset = i + 1
 882		}
 883	}
 884	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 885	if addMe.byteMatchSz != 0 {
 886		cms = append(cms, addMe)
 887	}
 888	return cms
 889}
 890
 891// evalMatchTree should be called instead of directly calling
 892// matchTree.matches. It cache known values for future evaluation at higher
 893// costs.
 894func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 895	if v, ok := known[mt]; ok {
 896		return matchesStatePred(v)
 897	}
 898
 899	ms := mt.matches(cp, cost, known)
 900	if ms != matchesRequiresHigherCost {
 901		known[mt] = ms == matchesFound
 902	}
 903
 904	return ms
 905}
 906
 907func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 908	switch evalMatchTree(cp, cost, known, t.child) {
 909	case matchesRequiresHigherCost:
 910		return matchesRequiresHigherCost
 911	case matchesFound:
 912		return matchesNone
 913	case matchesNone:
 914		return matchesFound
 915	default:
 916		panic("unreachable")
 917	}
 918}
 919
 920func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 921	return evalMatchTree(cp, cost, known, t.child)
 922}
 923
 924func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 925	return evalMatchTree(cp, cost, known, t.child)
 926}
 927
 928func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 929	if t.contEvaluated {
 930		return matchesStateForSlice(t.current)
 931	}
 932
 933	if len(t.current) == 0 {
 934		return matchesNone
 935	}
 936
 937	if t.fileName && cost < costMemory {
 938		return matchesRequiresHigherCost
 939	}
 940
 941	if !t.fileName && cost < costContent {
 942		return matchesRequiresHigherCost
 943	}
 944
 945	pruned := t.current[:0]
 946	for _, m := range t.current {
 947		if m.byteOffset == 0 && m.runeOffset > 0 {
 948			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 949		}
 950		if m.matchContent(cp.data(m.fileName)) {
 951			pruned = append(pruned, m)
 952		}
 953	}
 954	t.current = pruned
 955	t.contEvaluated = true
 956
 957	return matchesStateForSlice(t.current)
 958}
 959
 960type matchTreeOpt struct {
 961	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 962	// This was added since we do not support wordMatchTree with symbol search.
 963	DisableWordMatchOptimization bool
 964}
 965
 966func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 967	if q == nil {
 968		return nil, fmt.Errorf("got nil (sub)query")
 969	}
 970	switch s := q.(type) {
 971	case *query.Regexp:
 972		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 973		// superset of the regexp. If the returned matchTree is equivalent to the
 974		// original regexp, it returns true. An equivalent matchTree has the same
 975		// behaviour as the original regexp and can be used instead.
 976		//
 977		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 978		if err != nil {
 979			return nil, err
 980		}
 981		// if the query can be used in place of the regexp
 982		// return the subtree
 983		if isEq {
 984			return subMT, nil
 985		}
 986
 987		var tr matchTree
 988		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 989			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 990			// provide something faster.
 991			tr = wmt
 992		} else {
 993			tr = newRegexpMatchTree(s)
 994		}
 995
 996		return &andMatchTree{
 997			children: []matchTree{
 998				tr, &noVisitMatchTree{subMT},
 999			},
1000		}, nil
1001	case *query.And:
1002		var r []matchTree
1003		for _, ch := range s.Children {
1004			ct, err := d.newMatchTree(ch, opt)
1005			if err != nil {
1006				return nil, err
1007			}
1008			r = append(r, ct)
1009		}
1010		return &andMatchTree{r}, nil
1011	case *query.Or:
1012		var r []matchTree
1013		for _, ch := range s.Children {
1014			ct, err := d.newMatchTree(ch, opt)
1015			if err != nil {
1016				return nil, err
1017			}
1018			r = append(r, ct)
1019		}
1020		return &orMatchTree{r}, nil
1021	case *query.Not:
1022		ct, err := d.newMatchTree(s.Child, opt)
1023		return &notMatchTree{
1024			child: ct,
1025		}, err
1026
1027	case *query.Type:
1028		if s.Type != query.TypeFileName {
1029			break
1030		}
1031
1032		ct, err := d.newMatchTree(s.Child, opt)
1033		if err != nil {
1034			return nil, err
1035		}
1036
1037		return &fileNameMatchTree{
1038			child: ct,
1039		}, nil
1040
1041	case *query.Boost:
1042		ct, err := d.newMatchTree(s.Child, opt)
1043		if err != nil {
1044			return nil, err
1045		}
1046
1047		return &boostMatchTree{
1048			child: ct,
1049			boost: s.Boost,
1050		}, nil
1051
1052	case *query.Substring:
1053		return d.newSubstringMatchTree(s)
1054
1055	case *query.Branch:
1056		masks := make([]uint64, 0, len(d.repoMetaData))
1057		if s.Pattern == "HEAD" {
1058			for i := 0; i < len(d.repoMetaData); i++ {
1059				masks = append(masks, 1)
1060			}
1061		} else {
1062			for _, branchIDs := range d.branchIDs {
1063				mask := uint64(0)
1064				for nm, m := range branchIDs {
1065					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1066						mask |= uint64(m)
1067					}
1068				}
1069				masks = append(masks, mask)
1070			}
1071		}
1072		return &branchQueryMatchTree{
1073			masks:     masks,
1074			fileMasks: d.fileBranchMasks,
1075			repos:     d.repos,
1076		}, nil
1077	case *query.Const:
1078		if s.Value {
1079			return &bruteForceMatchTree{}, nil
1080		} else {
1081			return &noMatchTree{Why: "const"}, nil
1082		}
1083	case *query.Language:
1084		code, ok := d.metaData.LanguageMap[s.Language]
1085		if !ok {
1086			return &noMatchTree{Why: "lang"}, nil
1087		}
1088		return &docMatchTree{
1089			reason:  "language",
1090			numDocs: d.numDocs(),
1091			predicate: func(docID uint32) bool {
1092				return d.getLanguage(docID) == code
1093			},
1094		}, nil
1095
1096	case *query.Symbol:
1097		// Disable WordMatchTree since we don't support it in symbols yet.
1098		optCopy := opt
1099		optCopy.DisableWordMatchOptimization = true
1100
1101		subMT, err := d.newMatchTree(s.Expr, optCopy)
1102		if err != nil {
1103			return nil, err
1104		}
1105
1106		if substr, ok := subMT.(*substrMatchTree); ok {
1107			return &symbolSubstrMatchTree{
1108				substrMatchTree: substr,
1109				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1110				fileEndRunes:    d.fileEndRunes,
1111				fileEndSymbol:   d.fileEndSymbol,
1112				sections:        d.runeDocSections,
1113			}, nil
1114		}
1115
1116		var regexpMT *regexpMatchTree
1117		visitMatchTree(subMT, func(mt matchTree) {
1118			if t, ok := mt.(*regexpMatchTree); ok {
1119				regexpMT = t
1120			}
1121		})
1122		if regexpMT == nil {
1123			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1124		}
1125
1126		return &symbolRegexpMatchTree{
1127			regexp:    regexpMT.regexp,
1128			all:       isRegexpAll(regexpMT.origRegexp),
1129			matchTree: subMT,
1130		}, nil
1131
1132	case *query.FileNameSet:
1133		return &docMatchTree{
1134			reason:  "FileNameSet",
1135			numDocs: d.numDocs(),
1136			predicate: func(docID uint32) bool {
1137				fileName := d.fileName(docID)
1138				_, ok := s.Set[string(fileName)]
1139				return ok
1140			},
1141		}, nil
1142
1143	case *query.BranchesRepos:
1144		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1145		for repoIdx := range d.repoMetaData {
1146			var mask uint64
1147			for _, br := range s.List {
1148				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1149					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1150				}
1151			}
1152			reposBranchesWant[repoIdx] = mask
1153		}
1154		return &docMatchTree{
1155			reason:  "BranchesRepos",
1156			numDocs: d.numDocs(),
1157			predicate: func(docID uint32) bool {
1158				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1159			},
1160		}, nil
1161
1162	case *query.RepoSet:
1163		reposWant := make([]bool, len(d.repoMetaData))
1164		for repoIdx, r := range d.repoMetaData {
1165			if _, ok := s.Set[r.Name]; ok {
1166				reposWant[repoIdx] = true
1167			}
1168		}
1169		return &docMatchTree{
1170			reason:  "RepoSet",
1171			numDocs: d.numDocs(),
1172			predicate: func(docID uint32) bool {
1173				return reposWant[d.repos[docID]]
1174			},
1175		}, nil
1176
1177	case *query.RepoIDs:
1178		reposWant := make([]bool, len(d.repoMetaData))
1179		for repoIdx, r := range d.repoMetaData {
1180			if s.Repos.Contains(r.ID) {
1181				reposWant[repoIdx] = true
1182			}
1183		}
1184		return &docMatchTree{
1185			reason:  "RepoIDs",
1186			numDocs: d.numDocs(),
1187			predicate: func(docID uint32) bool {
1188				return reposWant[d.repos[docID]]
1189			},
1190		}, nil
1191
1192	case *query.Repo:
1193		reposWant := make([]bool, len(d.repoMetaData))
1194		for repoIdx, r := range d.repoMetaData {
1195			if s.Regexp.MatchString(r.Name) {
1196				reposWant[repoIdx] = true
1197			}
1198		}
1199		return &docMatchTree{
1200			reason:  "Repo",
1201			numDocs: d.numDocs(),
1202			predicate: func(docID uint32) bool {
1203				return reposWant[d.repos[docID]]
1204			},
1205		}, nil
1206
1207	case *query.RepoRegexp:
1208		reposWant := make([]bool, len(d.repoMetaData))
1209		for repoIdx, r := range d.repoMetaData {
1210			if s.Regexp.MatchString(r.Name) {
1211				reposWant[repoIdx] = true
1212			}
1213		}
1214		return &docMatchTree{
1215			reason:  "RepoRegexp",
1216			numDocs: d.numDocs(),
1217			predicate: func(docID uint32) bool {
1218				return reposWant[d.repos[docID]]
1219			},
1220		}, nil
1221
1222	case query.RawConfig:
1223		return &docMatchTree{
1224			reason:  s.String(),
1225			numDocs: d.numDocs(),
1226			predicate: func(docID uint32) bool {
1227				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1228			},
1229		}, nil
1230	}
1231	log.Panicf("type %T", q)
1232	return nil, nil
1233}
1234
1235func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1236	st := &substrMatchTree{
1237		query:         s,
1238		caseSensitive: s.CaseSensitive,
1239		fileName:      s.FileName,
1240	}
1241
1242	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1243		return newRegexpMatchTree(&query.Regexp{
1244			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1245			FileName:      s.FileName,
1246			Content:       s.Content,
1247			CaseSensitive: s.CaseSensitive,
1248		}), nil
1249	}
1250
1251	result, err := d.iterateNgrams(s)
1252	if err != nil {
1253		return nil, err
1254	}
1255	st.matchIterator = result
1256	return st, nil
1257}
1258
1259func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1260	if opt.DisableWordMatchOptimization {
1261		return nil, false
1262	}
1263	// Needs to be case sensitive
1264	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1265		return nil, false
1266	}
1267	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1268	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1269		return nil, false
1270	}
1271	sub := q.Regexp.Sub
1272	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1273		return nil, false
1274	}
1275
1276	return &wordMatchTree{
1277		word:     string(sub[1].Rune),
1278		fileName: q.FileName,
1279	}, true
1280}
1281
1282// pruneMatchTree removes impossible branches from the matchTree, as indicated
1283// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1284func pruneMatchTree(mt matchTree) (matchTree, error) {
1285	var err error
1286	switch mt := mt.(type) {
1287	// leaf nodes that we test with the filter:
1288	case *substrMatchTree:
1289		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1290			if _, ok := res.matchIterator.(*noMatchTree); ok {
1291				return nil, nil
1292			}
1293		}
1294	// recursive tree structures:
1295	case *andMatchTree:
1296		// Any branch of an and becoming impossible means the entire clause
1297		// is impossible. Otherwise, just handle rewrites.
1298		for i, child := range mt.children {
1299			newChild, err := pruneMatchTree(child)
1300			if err != nil {
1301				return nil, err
1302			}
1303			if newChild == nil {
1304				return nil, nil
1305			}
1306			mt.children[i] = newChild
1307		}
1308	case *orMatchTree:
1309		// *All* branches of an OR becoming impossible means the entire clause
1310		// is impossible. Otherwise, drop impossible subclauses and handle
1311		// rewrites, including simplifying to a singular resulting child branch.
1312		n := 0
1313		for _, child := range mt.children {
1314			newChild, err := pruneMatchTree(child)
1315			if err != nil {
1316				return nil, err
1317			}
1318			if newChild != nil {
1319				mt.children[n] = newChild
1320				n++
1321			}
1322		}
1323		mt.children = mt.children[:n]
1324		if len(mt.children) == 1 {
1325			return mt.children[0], nil
1326		} else if len(mt.children) == 0 {
1327			return nil, nil
1328		}
1329	case *noVisitMatchTree:
1330		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1331		if err != nil {
1332			return nil, err
1333		}
1334		if mt.matchTree == nil {
1335			return nil, nil
1336		}
1337	case *fileNameMatchTree:
1338		mt.child, err = pruneMatchTree(mt.child)
1339		if err != nil {
1340			return nil, err
1341		}
1342		if mt.child == nil {
1343			return nil, nil
1344		}
1345	case *boostMatchTree:
1346		mt.child, err = pruneMatchTree(mt.child)
1347		if err != nil {
1348			return nil, err
1349		}
1350		if mt.child == nil {
1351			return nil, nil
1352		}
1353	case *andLineMatchTree:
1354		child, err := pruneMatchTree(&mt.andMatchTree)
1355		if err != nil {
1356			return nil, err
1357		}
1358		if child == nil {
1359			return nil, nil
1360		}
1361		if c, ok := child.(*andMatchTree); ok {
1362			mt.andMatchTree = *c
1363		} else {
1364			// the and was simplified to a single clause,
1365			// so the linematch portion is irrelevant.
1366			return mt, nil
1367		}
1368	case *notMatchTree:
1369		mt.child, err = pruneMatchTree(mt.child)
1370		if err != nil {
1371			return nil, err
1372		}
1373		if mt.child == nil {
1374			// not false => true
1375			return &bruteForceMatchTree{}, nil
1376		}
1377	// unhandled:
1378	case *docMatchTree:
1379	case *bruteForceMatchTree:
1380	case *regexpMatchTree:
1381	case *wordMatchTree:
1382	}
1383	return mt, err
1384}
1385
1386// isRegexpAll returns true if the query matches all possible lines.
1387//
1388// Note: it is possible for a funky regex to actually match all but this
1389// returns false. This returns true for normal looking regexes like ".*" or
1390// "(?-s:.*)".
1391func isRegexpAll(r *syntax.Regexp) bool {
1392	// Note: we don't care about any flags since we are looking for .* and we
1393	// don't mind if . matches all or everything but newline.
1394
1395	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1396	for {
1397		// Our main target: .*
1398		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1399			// (?s:.) or (?-s:.)
1400			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1401		}
1402
1403		// Strip away expressions being wrapped in paranthesis
1404		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1405			r = r.Sub[0]
1406			continue
1407		}
1408
1409		return false
1410	}
1411}
Configure Feed

Configure Feed