直接上代码进行演示,对比三种方式耗时情况!
示例代码1: 【循环读取数据,一条一条插入es数据库】
import re
import time
from elasticsearch import Elasticsearch
# Demo 1: read the XML file line by line, then index the parsed entries into
# Elasticsearch one document per request, printing how long each stage takes.

# Address of the Elasticsearch node (local default).
ES_URL = "http://localhost:9200"
# XML file to load; the path is relative to the working directory.
XML_PATH = './2007.xml'
# Index that receives one document per <entry>.
INDEX_NAME = 'entry'

# Captures the body of each <entry>...</entry> element.  "." does not match a
# newline, so this relies on newlines having been stripped from the text first.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs, in the order the keys appear in the indexed
# document.  Compiled once here instead of once per entry.
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ("vuln-id", re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ("published", re.compile(r'<published>(.+?)</published>')),
    ("modified", re.compile(r'<modified>(.+?)</modified>')),
    ("source", re.compile(r'<source>(.+?)</source>')),
    ("severity", re.compile(r'<severity>(.+?)</severity>')),
    ("vuln-type", re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ("thrtype", re.compile(r'<thrtype>(.+?)</thrtype>')),
    ("vuln-descript", re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    # An entry may contain several <product> elements.
    ("product", re.compile(r'<product>CPE:/(.+?):</product>')),
    # An entry may contain several <vuln-solution> elements.
    ("vuln-solution", re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def strip_layout(text):
    """Return *text* with every newline and every space character removed.

    NOTE(review): this also removes the spaces inside element text (for
    example inside descriptions); kept because the stored data of the
    original demo depends on it.
    """
    return text.replace('\n', '').replace(' ', '')


def read_xml_by_line(path):
    """Read *path* line by line and return ``(all_str, total_num)``.

    ``all_str`` is the file content with newlines and spaces stripped and
    ``total_num`` is the number of lines read.  The file is closed as soon as
    reading finishes, even if an error occurs.
    """
    pieces = []
    with open(path, "r", encoding='UTF-8') as xml_file:
        for line in xml_file:
            pieces.append(strip_layout(line))
    # Join once at the end instead of growing a string with "+=" per line.
    return "".join(pieces), len(pieces)


def split_entries(all_str):
    """Return the inner text of every ``<entry>...</entry>`` in *all_str*."""
    return ENTRY_PATTERN.findall(all_str)


def parse_entry(entry):
    """Turn the inner text of one ``<entry>`` into a document dict.

    Every value is the list of all matches for that field, so a field that is
    absent from the entry maps to an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def main():
    """Run the demo: read, parse, index one by one, and print the timings."""
    es = Elasticsearch(ES_URL)

    time_start = time.time()
    all_str, total_num = read_xml_by_line(XML_PATH)
    time_end = time.time()
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    list_entry = split_entries(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry> </entry>花费了:", time_end2 - time_end, "秒")

    documents = [parse_entry(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # One request per document; the position in the list is the document id.
    for doc_id, document in enumerate(documents):
        es.index(index=INDEX_NAME, id=doc_id, body=document)
    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()
运行结果:
示例代码2: 【循环读取数据,批量处理插入es数据库】
import re
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers # 批量处理数据
# Demo 2: read the XML file line by line, then send all parsed entries to
# Elasticsearch in one bulk call, printing how long each stage takes.

# Address of the Elasticsearch node (local default).
ES_URL = "http://localhost:9200"
# XML file to load; the path is relative to the working directory.
XML_PATH = './2007.xml'
# Index that receives one document per <entry>.
INDEX_NAME = "entry_bulk"

# Captures the body of each <entry>...</entry> element.  "." does not match a
# newline, so this relies on newlines having been stripped from the text first.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs, in the order the keys appear in the indexed
# document.  Compiled once here instead of once per entry.
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ("vuln-id", re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ("published", re.compile(r'<published>(.+?)</published>')),
    ("modified", re.compile(r'<modified>(.+?)</modified>')),
    ("source", re.compile(r'<source>(.+?)</source>')),
    ("severity", re.compile(r'<severity>(.+?)</severity>')),
    ("vuln-type", re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ("thrtype", re.compile(r'<thrtype>(.+?)</thrtype>')),
    ("vuln-descript", re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    # An entry may contain several <product> elements.
    ("product", re.compile(r'<product>CPE:/(.+?):</product>')),
    # An entry may contain several <vuln-solution> elements.
    ("vuln-solution", re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def strip_layout(text):
    """Return *text* with every newline and every space character removed.

    NOTE(review): this also removes the spaces inside element text (for
    example inside descriptions); kept because the stored data of the
    original demo depends on it.
    """
    return text.replace('\n', '').replace(' ', '')


def read_xml_by_line(path):
    """Read *path* line by line and return ``(all_str, total_num)``.

    ``all_str`` is the file content with newlines and spaces stripped and
    ``total_num`` is the number of lines read.  The file is closed as soon as
    reading finishes, even if an error occurs.
    """
    pieces = []
    with open(path, "r", encoding='UTF-8') as xml_file:
        for line in xml_file:
            pieces.append(strip_layout(line))
    # Join once at the end instead of growing a string with "+=" per line.
    return "".join(pieces), len(pieces)


def split_entries(all_str):
    """Return the inner text of every ``<entry>...</entry>`` in *all_str*."""
    return ENTRY_PATTERN.findall(all_str)


def parse_entry(entry):
    """Turn the inner text of one ``<entry>`` into a document dict.

    Every value is the list of all matches for that field, so a field that is
    absent from the entry maps to an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def build_actions(documents, index_name=INDEX_NAME):
    """Wrap *documents* as bulk actions targeting *index_name*.

    Each source document gets an extra leading ``"id"`` field holding its
    position in *documents*.  No ``_type`` key is emitted: mapping types are
    deprecated in Elasticsearch 7 and rejected by Elasticsearch 8.
    """
    return [
        {"_index": index_name, "_source": {"id": doc_id, **document}}
        for doc_id, document in enumerate(documents)
    ]


def main():
    """Run the demo: read, parse, bulk-index, and print the timings."""
    es = Elasticsearch(ES_URL)

    time_start = time.time()
    all_str, total_num = read_xml_by_line(XML_PATH)
    time_end = time.time()
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    list_entry = split_entries(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry> </entry>花费了:", time_end2 - time_end, "秒")

    documents = [parse_entry(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # Building the action list is timed together with the bulk request.
    helpers.bulk(es, build_actions(documents))
    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()
运行结果:
示例代码3: 【直接一次性读取数据,批量处理插入es数据库】
import re
import time
from elasticsearch import Elasticsearch
from elasticsearch import helpers # 批量处理数据
# Demo 3: read the whole XML file with a single read() call, then send all
# parsed entries to Elasticsearch in one bulk call, printing how long each
# stage takes.

# Address of the Elasticsearch node (local default).
ES_URL = "http://localhost:9200"
# XML file to load; the path is relative to the working directory.
XML_PATH = './2007.xml'
# Index that receives one document per <entry>.
INDEX_NAME = "fask_entry_bulk"

# Captures the body of each <entry>...</entry> element.  "." does not match a
# newline, so this relies on newlines having been stripped from the text first.
ENTRY_PATTERN = re.compile(r'<entry>(.+?)</entry>')

# (document key, pattern) pairs, in the order the keys appear in the indexed
# document.  Compiled once here instead of once per entry.
FIELD_PATTERNS = (
    ('name', re.compile(r'<name>(.+?)</name>')),
    ("vuln-id", re.compile(r'<vuln-id>(.+?)</vuln-id>')),
    ("published", re.compile(r'<published>(.+?)</published>')),
    ("modified", re.compile(r'<modified>(.+?)</modified>')),
    ("source", re.compile(r'<source>(.+?)</source>')),
    ("severity", re.compile(r'<severity>(.+?)</severity>')),
    ("vuln-type", re.compile(r'<vuln-type>(.+?)</vuln-type>')),
    ("thrtype", re.compile(r'<thrtype>(.+?)</thrtype>')),
    ("vuln-descript", re.compile(r'<vuln-descript>(.+?)</vuln-descript>')),
    # An entry may contain several <product> elements.
    ("product", re.compile(r'<product>CPE:/(.+?):</product>')),
    # An entry may contain several <vuln-solution> elements.
    ("vuln-solution", re.compile(r'<vuln-solution>(.+?)</vuln-solution>')),
)


def strip_layout(text):
    """Return *text* with every newline and every space character removed.

    NOTE(review): this also removes the spaces inside element text (for
    example inside descriptions); kept because the stored data of the
    original demo depends on it.
    """
    return text.replace('\n', '').replace(' ', '')


def count_lines(text):
    """Return how many lines *text* holds, counting '\\n' as the separator.

    A final line without a trailing newline still counts as a line; an empty
    string has zero lines.
    """
    if not text:
        return 0
    return text.count('\n') + (0 if text.endswith('\n') else 1)


def split_entries(all_str):
    """Return the inner text of every ``<entry>...</entry>`` in *all_str*."""
    return ENTRY_PATTERN.findall(all_str)


def parse_entry(entry):
    """Turn the inner text of one ``<entry>`` into a document dict.

    Every value is the list of all matches for that field, so a field that is
    absent from the entry maps to an empty list.
    """
    return {key: pattern.findall(entry) for key, pattern in FIELD_PATTERNS}


def build_actions(documents, index_name=INDEX_NAME):
    """Wrap *documents* as bulk actions targeting *index_name*.

    Each source document gets an extra leading ``"id"`` field holding its
    position in *documents*.  No ``_type`` key is emitted: mapping types are
    deprecated in Elasticsearch 7 and rejected by Elasticsearch 8.
    """
    return [
        {"_index": index_name, "_source": {"id": doc_id, **document}}
        for doc_id, document in enumerate(documents)
    ]


def main():
    """Run the demo: read at once, parse, bulk-index, and print the timings."""
    es = Elasticsearch(ES_URL)

    time_start = time.time()
    # Read the entire file in one call; the handle is closed right after.
    with open(XML_PATH, "r", encoding='UTF-8') as xml_file:
        raw_text = xml_file.read()
    all_str = strip_layout(raw_text)
    time_end = time.time()
    # Report the real line count instead of a hard-coded 1; it is computed
    # after time_end so it does not inflate the measured read time.
    total_num = count_lines(raw_text)
    print("共处理了", total_num, "行xml数据")
    print("文件所有字符(字符串)长度为:", len(all_str))
    print("文件处理花费了:", time_end - time_start, "秒")

    list_entry = split_entries(all_str)
    print('共有' + str(len(list_entry)) + '条<entry>数据')
    time_end2 = time.time()
    print("取正则表达式取字符串<entry> </entry>花费了:", time_end2 - time_end, "秒")

    documents = [parse_entry(entry) for entry in list_entry]
    print(len(documents))
    time_end3 = time.time()
    print("循环处理所有的<entry>花费了:", time_end3 - time_end2, "秒")

    # Building the action list is timed together with the bulk request.
    helpers.bulk(es, build_actions(documents))
    print("向数据库中插入花费了:", time.time() - time_end3, "秒")


if __name__ == "__main__":
    main()
运行结果:
插入数据后,可以在数据库中查看写入的结果。文章来源:https://www.toymoban.com/news/detail-596506.html
文章来源地址https://www.toymoban.com/news/detail-596506.html
到了这里,关于Python简单实现与ElasticSearch交互插入数据的文章就介绍完了。如果您还想了解更多内容,请在右上角搜索TOY模板网以前的文章或继续浏览下面的相关文章,希望大家以后多多支持TOY模板网!