MariaDB数据库5.5.27 HASH JOIN源码解读

MariaDB数据库加入了对HASH JOIN算法的支持,我对HASH JOIN不了解,借此机会学习一下,测试的数据库版本为MariaDB5.5.27。

先是配置文件,这是我为了方便跟踪源码,在windows上建的环境。

 

D:mariadb-5.5.27sqlDebug>more my.ini
[mysqld]
innodb_file_per_table
optimizer_switch='index_condition_pushdown=on'
optimizer_switch='mrr=on'
optimizer_switch='mrr_sort_keys=on'
optimizer_switch='mrr_cost_based=off'
mrr_buffer_size=32M
optimizer_switch='join_cache_incremental=on'
optimizer_switch='join_cache_hashed=on'
optimizer_switch='join_cache_bka=on'
join_cache_level=4

 

数据库表结构:
CREATE TABLE `t1` (

`id` int(11) NOT NULL AUTO_INCREMENT,

`name` varchar(100) DEFAULT NULL,

`name_1` varchar(100) DEFAULT NULL,

PRIMARY KEY (`id`)

) ENGINE=InnoDB;



Create Table: CREATE TABLE `t3` (

`t3id` int(11) NOT NULL AUTO_INCREMENT,

`t3name` varchar(100) DEFAULT NULL,

PRIMARY KEY (`t3id`)

) ENGINE=InnoDB;

 

 

数据库表数据量:

MariaDB [tm]> select count(*) from t1;

+----------+

| count(*) |

+----------+

|        6 |

+----------+

1 row in set (0.00 sec)



MariaDB [tm]> select count(*) from t3;

+----------+

| count(*) |

+----------+

|   262144 |

+----------+

1 row in set (0.94 sec)

 

 

看一下执行计划:

 

explain select t3.t3name from t1,t3 where t1.name=t3.t3name;
id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where
1 SIMPLE t3 hash_ALL NULL #hash#$hj 104 tm.t1.name 263785 Using where; Using join buffer (flat, BNLH join)
BNLH join就是Block nested loop hash join。

call stack:

•mysqld.exe!JOIN::exec()  行2833
• mysqld.exe!mysql_select(THD * thd=0x1c3c1fc0, Item * * * rref_pointer_array=0x1c3c3a84, TABLE_LIST * tables=0x1c44cf90, unsigned int wild_num=0, List<Item> & fields={...}, Item * conds=0x1c44d9b8, unsigned int og_num=0, st_order * order=0x00000000, st_order * group=0x00000000, Item * having=0x00000000, st_order * proc_param=0x00000000, unsigned __int64 select_options=2147748608, select_result * result=0x1c44db38, st_select_lex_unit * unit=0x1c3c3480, st_select_lex * select_lex=0x1c3c3940)  行3055
• mysqld.exe!handle_select(THD * thd=0x1c3c1fc0, LEX * lex=0x1c3c3418, select_result * result=0x1c44db38, unsigned long setup_tables_done_option=0)  行316 + 0x9e 字节
• mysqld.exe!execute_sqlcom_select(THD * thd=0x1c3c1fc0, TABLE_LIST * all_tables=0x1c44cf90)  行4621 + 0x13 字节
• mysqld.exe!mysql_execute_command(THD * thd=0x1c3c1fc0)  行2189 + 0xd 字节
• mysqld.exe!mysql_parse(THD * thd=0x1c3c1fc0, char * rawbuf=0x1c44cdc0, unsigned int length=51, Parser_state * parser_state=0x1ee3f9ec)  行5736 + 0x9 字节
• mysqld.exe!dispatch_command(enum_server_command command=COM_QUERY, THD * thd=0x1c3c1fc0, char * packet=0x1c444ad1, unsigned int packet_length=51)  行1055 + 0x22 字节
• mysqld.exe!do_command(THD * thd=0x1c3c1fc0)  行794 + 0x1b 字节
• mysqld.exe!threadpool_process_request(THD * thd=0x1c3c1fc0)  行225 + 0x9 字节
• mysqld.exe!io_completion_callback(_TP_CALLBACK_INSTANCE * instance=0x1ee3fd54, void * context=0x1c3f6350, void * overlapped=0x1c3f6358, unsigned long io_result=0, unsigned long nbytes=0, _TP_IO * io=0x003bce20)  行584 + 0xb 字节

sql_select.cc,  exec()

  thd_proc_info(thd, "Sending data");
DBUG_PRINT("info", ("%s", thd->proc_info));
result->send_result_set_metadata((procedure ? curr_join->procedure_fields_list :
*curr_fields_list),
Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF);
error= do_select(curr_join, curr_fields_list, NULL, procedure);
看到Sending data,很亲切吧。

sql_select.cc,  do_select()

    DBUG_ASSERT(join->table_count);
if (join->outer_ref_cond && !join->outer_ref_cond->val_int())
error= NESTED_LOOP_NO_MORE_ROWS;
else
error= sub_select(join,join_tab,0);//本例中填充join buffer
if ((error == NESTED_LOOP_OK || error == NESTED_LOOP_NO_MORE_ROWS) &&
join->thd->killed != ABORT_QUERY)
error= sub_select(join,join_tab,1);//本例中t3与join buffer中记录匹配并发送结果
if (error == NESTED_LOOP_QUERY_LIMIT)
error= NESTED_LOOP_OK;                    /* select_limit used */

sql_select.cc,  sub_select()

 if (rc != NESTED_LOOP_NO_MORE_ROWS)
{
error= (*join_tab->read_first_record)(join_tab);//读t1的第一条记录
if (join_tab->keep_current_rowid)
join_tab->table->file->position(join_tab->table->record[0]);
rc= evaluate_join_record(join, join_tab, error);//处理第一条记录
}
/*
Note: psergey has added the 2nd part of the following condition; the
change should probably be made in 5.1, too.
*/
bool skip_over= FALSE;
while (rc == NESTED_LOOP_OK && join->return_tab >= join_tab)//循环读取和处理记录
{
if (join_tab->loosescan_match_tab &&
join_tab->loosescan_match_tab->found_match)
{
KEY *key= join_tab->table->key_info + join_tab->loosescan_key;
key_copy(join_tab->loosescan_buf, join_tab->table->record[0], key,
join_tab->loosescan_key_len);
skip_over= TRUE;
}error= info->read_record(info);//读一条记录if (skip_over && !error)
{
if(!key_cmp(join_tab->table->key_info[join_tab->loosescan_key].key_part,
join_tab->loosescan_buf, join_tab->loosescan_key_len))
{
/*
This is the LooseScan action: skip over records with the same key
value if we already had a match for them.
*/
continue;
}
join_tab->loosescan_match_tab->found_match= FALSE;
skip_over= FALSE;
}if (join_tab->keep_current_rowid)
join_tab->table->file->position(join_tab->table->record[0]);rc= evaluate_join_record(join, join_tab, error);//处理一条记录
}

先读哪个表?

很明确是读tm.t1表。

读到什么记录?

看到了,字段值是name1

•evaluate_join_record处理一行记录

    if (found)
{
enum enum_nested_loop_state rc;
/* A match from join_tab is found for the current partial join. */
rc= (*join_tab->next_select)(join, join_tab+1, 0);//本例此处进入sub_select_cache
join->thd->warning_info->inc_current_row_for_warning();
if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
DBUG_RETURN(rc);
if (return_tab < join->return_tab)
join->return_tab= return_tab;if (join->return_tab < join_tab)
DBUG_RETURN(NESTED_LOOP_OK);
/*
Test if this was a SELECT DISTINCT query on a table that
was not in the field list;  In this case we can abort if
we found a row, as no new rows can be added to the result.
*/
if (not_used_in_distinct && found_records != join->found_records)
DBUG_RETURN(NESTED_LOOP_NO_MORE_ROWS);
}
else
{
join->thd->warning_info->inc_current_row_for_warning();
join_tab->read_record.unlock_row(join_tab);
}
}

—sub_select_cache向join buffer里写记录

  if (!test_if_use_dynamic_range_scan(join_tab))
{
if (!cache->put_record())
DBUG_RETURN(NESTED_LOOP_OK);
/*
We has decided that after the record we've just put into the buffer
won't add any more records. Now try to find all the matching
extensions for all records in the buffer.
*/
rc= cache->join_records(FALSE);//join buffer写满了,就会执行到这步
DBUG_RETURN(rc);
}

本例中调用到专门为HASH JOIN实现的put_record

bool JOIN_CACHE_HASHED::put_record()
{
bool is_full;
uchar *key;
uint key_len= key_length;
uchar *key_ref_ptr;
uchar *link= 0;
TABLE_REF *ref= &join_tab->ref;
uchar *next_ref_ptr= pos;pos+= get_size_of_rec_offset();
/* Write the record into the join buffer */
if (prev_cache)
link= prev_cache->get_curr_rec_link();
write_record_data(link, &is_full);

向join buffer里写完所有t1表记录,回到do_select,再调

sub_select

else
{
DBUG_ASSERT(join->table_count);
if (join->outer_ref_cond && !join->outer_ref_cond->val_int())
error= NESTED_LOOP_NO_MORE_ROWS;
else
error= sub_select(join,join_tab,0);
if ((error == NESTED_LOOP_OK || error == NESTED_LOOP_NO_MORE_ROWS) &&
join->thd->killed != ABORT_QUERY)
error= sub_select(join,join_tab,1);//再次调用sub_select,本例中要开始对t3表处理
if (error == NESTED_LOOP_QUERY_LIMIT)
enum_nested_loop_state
sub_select(JOIN *join,JOIN_TAB *join_tab,bool end_of_records)
{
DBUG_ENTER("sub_select");join_tab->table->null_row=0;
if (end_of_records)
{
enum_nested_loop_state nls=
(*join_tab->next_select)(join,join_tab+1,end_of_records);//end_of_records是1,从此处进入sub_select_cache
DBUG_RETURN(nls);
}

•sub_select_cache

  if (end_of_records)
{
rc= cache->join_records(FALSE);
if (rc == NESTED_LOOP_OK || rc == NESTED_LOOP_NO_MORE_ROWS)
rc= sub_select(join, join_tab, end_of_records);
DBUG_RETURN(rc);
}

—JOIN_CACHE::join_records

  if (!join_tab->first_unmatched)
{
/* Find all records from join_tab that match records from join buffer */
rc= join_matching_records(skip_last);
if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
goto finish;

—JOIN_CACHE::join_matching_records

  /* Prepare to retrieve all records of the joined table */
if ((error= join_tab_scan->open()))//要读t3表的记录了
{
/*
TODO: if we get here, we will assert in net_send_statement(). Add test
coverage and fix.
*/
goto finish;
}

—JOIN_TAB_SCAN::open()

int JOIN_TAB_SCAN::open()
{
save_or_restore_used_tabs(join_tab, FALSE);
is_first_record= TRUE;
return join_init_read_record(join_tab);
}

—join_init_read_record

int join_init_read_record(JOIN_TAB *tab)
{
if (tab->select && tab->select->quick && tab->select->quick->reset())
return 1;
if (!tab->preread_init_done && tab->preread_init())
return 1;
if (init_read_record(&tab->read_record, tab->join->thd, tab->table,
tab->select,1,1, FALSE))
return 1;
return (*tab->read_record.read_record)(&tab->read_record);//开始读第一条记录了
}

在读哪个表了?

读到了哪条记录?name2

—JOIN_CACHE::join_matching_records

  while (!(error= join_tab_scan->next()))
{
if (join->thd->killed)
{
/* The user has aborted the execution of the query */
join->thd->send_kill_message();
rc= NESTED_LOOP_KILLED;
goto finish;
}if (join_tab->keep_current_rowid)
join_tab->table->file->position(join_tab->table->record[0]);/* Prepare to read matching candidates from the join buffer */
    /*本例中,此处通过HASH表定位出所有匹配t3表当前一条记录的KEY*/
if (prepare_look_for_matches(skip_last))
continue;uchar *rec_ptr;
/* Read each possible candidate from the buffer and look for matches */
    /* 在HASH JOIN中,此处只循环满足匹配条件的KEY*/
   /*在不支持HASH JOIN的过程中,此处陆续读出join buffer中所有记录*/
   /*HASH JOIN的优势就在于本循环的次数少*/
while ((rec_ptr= get_next_candidate_for_match()))
{
/*
If only the first match is needed, and, it has been already found for
the next record read from the join buffer, then the record is skipped.
Also those records that must be null complemented are not considered
as candidates for matches.
*/
if ((!check_only_first_match && !outer_join_first_inner) ||
!skip_next_candidate_for_match(rec_ptr))
{
read_next_candidate_for_match(rec_ptr);
rc= generate_full_extensions(rec_ptr);
if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
goto finish;
}
}
}

•JOIN_CACHE::generate_full_extensions

enum_nested_loop_state JOIN_CACHE::generate_full_extensions(uchar *rec_ptr)
{
enum_nested_loop_state rc= NESTED_LOOP_OK;
DBUG_ENTER(“JOIN_CACHE::generate_full_extensions”);

/*
Check whether the extended partial join record meets
the pushdown conditions.
*/
if (check_match(rec_ptr))
{
int res= 0;

if (!join_tab->check_weed_out_table ||
!(res= join_tab->check_weed_out_table->sj_weedout_check_row(join->thd)))
{
set_curr_rec_link(rec_ptr);
rc= (join_tab->next_select)(join, join_tab+1, 0);//此处进入end_send
if (rc != NESTED_LOOP_OK && rc != NESTED_LOOP_NO_MORE_ROWS)
{
reset(TRUE);
DBUG_RETURN(rc);
}
}
if (res == -1)
{
rc= NESTED_LOOP_ERROR;
DBUG_RETURN(rc);
}
}
else if (join->thd->is_error())
rc= NESTED_LOOP_ERROR;
DBUG_RETURN(rc);
}

 

end_send

if (join->do_send_rows)
{
int error;
/* result < 0 if row was not accepted and should not be counted */
if ((error= join->result->send_data(*join->fields)))//再跟踪,就到网络接口了,应该是发送记录了
DBUG_RETURN(error < 0 ? NESTED_LOOP_OK : NESTED_LOOP_ERROR);
}

—到此,找到并发送了一条满足条件的记录
—之后继续循环读取t3记录,做同样处理

两种explain结果比较

id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 6 Using where
1 SIMPLE t3 hash_ALL NULL #hash#$hj 104 tm.t1.name 262038 Using where; Using join buffer (flat, BNLH join)

上面是支持HASH JOIN时的计划,下面是不支持HASH JOIN时的计划

id select_type table type possible_keys key key_len ref rows Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 6
1 SIMPLE t3 ALL NULL NULL NULL NULL 261619 Using where; Using join buffer (flat, BNL join)

 

 

 Show profile比较

Status      HASH JOIN Duration
starting 0.000116
checking permissions 0.000010
checking permissions 0.000012
Opening tables 0.000064
System lock 0.000015
Table lock 0.000020
init 0.000056
optimizing 0.000037
statistics 0.000080
preparing 0.000077
executing 0.000008
Sending data 0.931718
end 0.000020
query end 0.000012
closing tables 0.000025
freeing items 0.000118
logging slow query 0.000005
cleaning up 0.000008
上面是支持HASH JOIN时的PROFILE
Status Duration
starting 0.000170
checking permissions 0.000014
checking permissions 0.000016
Opening tables 0.000094
System lock 0.000021
Table lock 0.000028
init 0.000072
optimizing 0.000061
statistics 0.000087
preparing 0.024370
executing 0.000016
Sending data 2.603823
end 0.000026
query end 0.000021
closing tables 0.000025
freeing items 0.000092
logging slow query 0.000005
cleaning up 0.000007
虽然只是优化减少了内层的循环次数,但性能也明显提升了。

Join_buffer写满时处理

•sub_select_cache
•满了则put_record返回1
•调用join_records,循环读大表全部记录,与当前hash中记录匹配
大表全部读完后,清空hash,再读小表填充HASH
  if (!test_if_use_dynamic_range_scan(join_tab))
{
if (!cache->put_record())
DBUG_RETURN(NESTED_LOOP_OK);
/*
We has decided that after the record we’ve just put into the buffer
won’t add any more records. Now try to find all the matching
extensions for all records in the buffer.
*/
rc= cache->join_records(FALSE);
DBUG_RETURN(rc);
}
因此join buffer写满,对BNLH或者BNL都是有很大影响的。每写满一次,都需要再扫一遍大表。

发表评论

电子邮件地址不会被公开。 必填项已用*标注

您可以使用这些HTML标签和属性: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>