4 Python Scripts เปลี่ยน NotebookLM เป็นเครื่องสรุปข้อมูลระดับโปร เร็วขึ้น 10 เท่า

13 ธันวาคม 2568 ทีม XDA-Developers พูดถึงวิธีใช้ NotebookLM ให้เก่งขึ้นและเร็วขึ้น ด้วยการเอา Python scripts ง่าย ๆ มาช่วยงานที่ทำซ้ำบ่อย ๆ แทนที่จะไปหาเครื่องมือใหม่เพิ่มเรื่อย ๆ โดยมี Python Scripts ทั้งหมด 4 ตัว ดังนี้

1. Web Scraper ดึงคอนเทนต์จากเว็บไหนก็ได้แบบไว ๆ

หัวใจของการใช้ NotebookLM คือ เติม notebook ด้วย Sources ที่คุณภาพดี ทีม XDA-Developers ได้ยินคนพูดบ่อยมากว่า การอัปโหลด sources เป็นไฟล์ .txt มักดีกว่าการใช้ PDFs เพราะไฟล์ข้อความมัน parse และ search ใน NotebookLM ได้ง่ายกว่า เพราะงั้น Script ที่ฉันใช้หนักมากก็คือ Script ที่ดึงข้อความจากลิสต์ URLs แล้วเซฟเป็น .txt นั่นเอง

Code:

import os
import re

import requests
from bs4 import BeautifulSoup


def clean_html_content(html_content):
    """Return the readable text of an HTML page.

    Args:
        html_content: Raw HTML (``str`` or ``bytes``).

    Returns:
        The page text with script/style/navigation-type elements removed,
        runs of blank lines collapsed to a single blank line, repeated
        spaces collapsed to one, and outer whitespace stripped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove elements that hold page chrome or non-text content.
    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form", "button", "iframe", "img"]):
        element.decompose()
    text = soup.get_text()
    text = re.sub(r'\n\s*\n', '\n\n', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()


def scrape_and_save(urls):
    """Download each URL and save its cleaned text as a .txt file.

    Files are written to the ``notebooklm_sources_web`` directory (created
    if missing) and named after the page title, falling back to
    ``article_<n>``. A failure on one URL is reported and the loop continues.

    Args:
        urls: Iterable of page URLs to fetch.
    """
    output_dir = "notebooklm_sources_web"
    os.makedirs(output_dir, exist_ok=True)
    print(f"Saving cleaned articles to the '{output_dir}' directory...")
    for i, url in enumerate(urls):
        fallback_name = f"article_{i+1}"
        try:
            print(f"Fetching: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            clean_text = clean_html_content(response.content)
            title_soup = BeautifulSoup(response.content, 'html.parser')
            # get_text() always yields a string; .string is None for an empty
            # <title> or one with nested tags, which used to abort the page.
            page_title = title_soup.title.get_text() if title_soup.title else fallback_name
            # Replace characters that are invalid in file names, including
            # '<', '>' and control characters such as newlines.
            filename_base = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', page_title).strip()
            filename = f"{filename_base[:50].strip() or fallback_name}.txt"
            output_path = os.path.join(output_dir, filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(clean_text)
            print(f" -> Success: Saved as {filename}")
        except requests.exceptions.RequestException as e:
            print(f" -> Error fetching {url}: {e}")
        except Exception as e:
            # Best-effort batch: report and move on to the next URL.
            print(f" -> An unexpected error occurred with {url}: {e}")


research_urls = [
    # Insert URLs here
    "https://www.xda-developers.com/proton-launches-excel-and.../",
    "https://www.xda-developers.com/im-never-going-back-to.../",
]

if __name__ == "__main__":
    scrape_and_save(research_urls)

ตัว Script จะไป fetch หน้าเว็บจากอินเทอร์เน็ต แล้วจัดการทำความสะอาดข้อความ เช่น ลบช่องว่างเกิน ๆ, ลบ blank lines

ดึงข้อความจากหลาย ๆ เว็บไซต์ (URLs)

ล้างของรก ๆ เช่น scripts, styles, header, ปุ่ม, ฟอร์ม, รูป

เซฟเป็นไฟล์ .txt

เหตุผล: .txt มัก parse/search ใน NotebookLM ได้ง่ายกว่า PDF

ใส่ URLs ได้กี่อันก็ได้ในตัวแปร research_urls แล้วมันจะ clean ให้ทีละหน้า และเซฟเป็นไฟล์ .txt แยกกันทีละไฟล์

2. YouTube Transcript Scraper แปลง VDO เป็น Text ในไม่กี่วินาที

เหตุผลเหมือนกับ Web Scraper ตัว Script ที่ใช้กับ YouTube รองรับได้หลายภาษา และจะจัด paragraph ให้อ่านง่ายแบบอัตโนมัติ มันจะ clean line breaks กับ whitespace ที่เกิน ๆ แล้วเซฟ transcript ออกมาเป็นไฟล์ .txt ที่เรียบร้อยมาก

วิธีใช้ก็แค่รันคำสั่ง command line นี้:

python youtube_transcript.py "link here" -o transcript.txt
.
#!/usr/bin/env python3
.
✨ Code:
import argparse
import re
import sys
# Import the one third-party dependency up front so a missing install produces
# an actionable hint instead of a traceback.
try:
    # The exception classes are taken from the package's public namespace
    # rather than its private ``_errors`` module.
    from youtube_transcript_api import (
        NoTranscriptFound,
        TranscriptsDisabled,
        YouTubeTranscriptApi,
    )
except ImportError:
    print("Required package not found. Install with:", file=sys.stderr)
    print(" pip install youtube-transcript-api", file=sys.stderr)
    sys.exit(1)
def extract_video_id(url_or_id: str) -> str:
    """Return the 11-character YouTube video ID found in a URL or bare ID.

    Recognises ``watch?v=`` links (with ``v`` anywhere in the query string),
    ``youtu.be/`` short links, ``/embed/``, ``/shorts/`` and ``/live/`` paths,
    and a bare 11-character ID. Surrounding whitespace is ignored.

    Args:
        url_or_id: A YouTube URL or a video ID.

    Returns:
        The video ID.

    Raises:
        ValueError: If no video ID can be found.
    """
    candidate = url_or_id.strip()
    patterns = [
        # watch URLs; the optional group skips query parameters before v=
        r'youtube\.com/watch\?(?:[^#\s]*?&)?v=([a-zA-Z0-9_-]{11})',
        r'(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/)([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$',
    ]
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from: {url_or_id}")
def format_timestamp(seconds: float) -> str:
    """Format a non-negative offset in seconds as ``MM:SS`` or ``HH:MM:SS``.

    Fractions of a second are dropped. The hour field appears only when the
    offset is one hour or more.
    """
    total_minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(total_minutes, 60)
    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"
def _group_into_paragraphs(texts):
    """Join caption snippets into paragraphs separated by blank lines.

    A paragraph is closed once it holds more than 100 words, or when a snippet
    ends with '.', '?' or '!' and the paragraph already has more than three
    snippets.
    """
    paragraphs = []
    current = []
    word_count = 0  # running total, so the paragraph is not re-joined per snippet
    for text in texts:
        current.append(text)
        word_count += len(text.split())
        ends_sentence = text.rstrip().endswith(('.', '?', '!'))
        if word_count > 100 or (ends_sentence and len(current) > 3):
            paragraphs.append(' '.join(current))
            current = []
            word_count = 0
    if current:
        paragraphs.append(' '.join(current))
    return '\n\n'.join(paragraphs)


def get_transcript(video_id: str, languages: list = None, include_timestamps: bool = False) -> str:
    """Fetch a video's transcript and return it as text.

    Args:
        video_id: YouTube video ID.
        languages: Preferred language codes. When omitted, English is tried
            first and then the first transcript the video offers.
        include_timestamps: If true, return one ``[MM:SS] text`` line per
            caption entry; otherwise return paragraph-grouped prose.

    Returns:
        The transcript text.

    Raises:
        NoTranscriptFound: If no matching transcript exists.
    """
    api = YouTubeTranscriptApi()
    if languages:
        transcript_list = api.list(video_id)
        transcript = transcript_list.find_transcript(languages)
        transcript_data = transcript.fetch()
    else:
        try:
            transcript_data = api.fetch(video_id, languages=['en'])
        except NoTranscriptFound:
            # Per the library's documentation, fetch() without `languages`
            # also defaults to English, so retrying it would fail identically.
            # Fall back to the first transcript the video actually has.
            first_available = next(iter(api.list(video_id)), None)
            if first_available is None:
                raise
            transcript_data = first_available.fetch()

    if include_timestamps:
        lines = []
        for entry in transcript_data:
            timestamp = format_timestamp(entry.start)
            text = entry.text.replace('\n', ' ')
            lines.append(f"[{timestamp}] {text}")
        return '\n'.join(lines)

    texts = [entry.text.replace('\n', ' ') for entry in transcript_data]
    return _group_into_paragraphs(texts)
def get_video_info(video_id: str) -> dict:
    """Return the video ID together with its canonical watch URL.

    Returns:
        A dict with the keys ``'video_id'`` and ``'url'``.
    """
    return {
        'video_id': video_id,
        'url': f"https://www.youtube.com/watch?v={video_id}",
    }
def main():
    """Command-line entry point: print or save a YouTube transcript.

    Parses the arguments, optionally lists the available transcript
    languages, otherwise fetches the transcript and writes it (with a short
    header) to the ``--output`` file or to stdout. Any failure is reported on
    stderr and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Download YouTube transcripts for NotebookLM',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python youtube_transcript.py "https://youtube.com/watch?v=VIDEO_ID"
python youtube_transcript.py VIDEO_ID -o transcript.txt
python youtube_transcript.py VIDEO_ID --timestamps
python youtube_transcript.py VIDEO_ID --language es
"""
    )
    parser.add_argument('video', help='YouTube URL or video ID')
    parser.add_argument('-o', '--output', help='Output file (default: print to stdout)')
    parser.add_argument('--timestamps', action='store_true', help='Include timestamps in output')
    parser.add_argument('--language', '-l', help='Preferred language code (e.g., en, es, fr)')
    parser.add_argument('--list-languages', action='store_true', help='List available transcript languages')
    args = parser.parse_args()
    try:
        video_id = extract_video_id(args.video)
        if args.list_languages:
            # The API client is only needed for this listing; fetching a
            # transcript goes through get_transcript, which builds its own.
            transcript_list = YouTubeTranscriptApi().list(video_id)
            print("Available transcripts:")
            for transcript in transcript_list:
                auto = " (auto-generated)" if transcript.is_generated else ""
                print(f" {transcript.language_code}: {transcript.language}{auto}")
            return
        languages = [args.language] if args.language else None
        transcript = get_transcript(video_id, languages, args.timestamps)
        video_info = get_video_info(video_id)
        output = f"""# YouTube Video Transcript
**Video URL:** {video_info['url']}
**Video ID:** {video_info['video_id']}
---
{transcript}
"""
        if args.output:
            with open(args.output, 'w', encoding='utf-8') as f:
                f.write(output)
            print(f"Saved to {args.output}")
        else:
            print(output)
    except TranscriptsDisabled:
        print("Error: Transcripts are disabled for this video", file=sys.stderr)
        sys.exit(1)
    except NoTranscriptFound:
        print("Error: No transcript found for this video", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

แล้วก็บันทึกเป็น .txt เป้าหมายคือ ใช้ NotebookLM ช่วยอ่าน/สรุป/ถามตอบจากวิดีโอได้เหมือนมีเอกสาร

3. File Splitter จากไฟล์ใหญ่ → ชิ้นเล็ก ๆ ที่จัดการง่าย

NotebookLM แต่ละ source รองรับได้สูงสุด 500,000 คำ ถ้า source ยาวเกินไป มันทำให้การ Search ช้าลง และดูเหมือน NotebookLM จะเริ่มงอแงกับการ parse ให้ดี ๆ ด้วย
แทนที่จะมานั่งแยกไฟล์เอง ฉันใช้ Script ที่ช่วยหั่นไฟล์ข้อความ ให้เป็นชิ้นเล็ก ๆ ที่ขนาดไม่เข้าใกล้ 500,000-word limit

Code:

import os
# Size budget for each output part; split_file compares it against a running
# character count (characters, not words).
MAX_CHARS_PER_FILE = 400000
# Folder whose files are split; passed to process_directory as input_dir.
INPUT_DIR = "files_to_split"
# Folder that receives the generated parts; passed as output_dir.
OUTPUT_DIR = "notebooklm_sources_split"
def _write_part(output_dir, base_name, ext, part_number, text):
    """Write one chunk to ``<base_name>_Part<part_number><ext>`` in output_dir."""
    output_path = os.path.join(output_dir, f"{base_name}_Part{part_number}{ext}")
    # newline='' writes the text verbatim, keeping the source line endings.
    with open(output_path, 'w', encoding='utf-8', newline='') as out_f:
        out_f.write(text)


def split_file(input_path, output_dir, max_chars=None):
    """Split a text file into parts of at most ``max_chars`` characters.

    Only ``.txt``, ``.csv``, ``.md`` and ``.log`` files are processed; others
    are ignored. Parts are cut at line boundaries so rows and paragraphs stay
    intact; a single line longer than the limit is cut at whitespace instead.
    Whitespace and line endings are preserved, so concatenating the parts
    reproduces the input. A single word longer than the limit is written as
    its own oversized part rather than being cut mid-word.

    Args:
        input_path: Path of the file to split.
        output_dir: Existing directory that receives the ``_PartN`` files.
        max_chars: Character limit per part. Defaults to the module-level
            ``MAX_CHARS_PER_FILE``.

    Raises:
        ValueError: If the limit is smaller than 1.
    """
    import re  # local import: this script's import block only brings in os

    limit = MAX_CHARS_PER_FILE if max_chars is None else max_chars
    if limit < 1:
        raise ValueError("max_chars must be a positive integer")

    filename = os.path.basename(input_path)
    base_name, ext = os.path.splitext(filename)
    if ext.lower() not in ('.txt', '.csv', '.md', '.log'):
        return
    try:
        with open(input_path, 'r', encoding='utf-8', newline='') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as exc:
        # Still best-effort, but say why the file was skipped.
        print(f"Skipping {input_path}: {exc}")
        return
    if not content.strip():
        return  # nothing but whitespace: produce no parts

    def pieces():
        for line in content.splitlines(keepends=True):
            if len(line) <= limit:
                yield line
            else:
                # Over-long line: break it into words, each keeping its
                # surrounding whitespace so no characters are lost.
                yield from re.findall(r'\s*\S+\s*|\s+', line)

    part_number = 0
    chunk = []
    chunk_len = 0
    for piece in pieces():
        # Flush only a non-empty chunk, so an oversized first piece never
        # produces an empty part file.
        if chunk and chunk_len + len(piece) > limit:
            part_number += 1
            _write_part(output_dir, base_name, ext, part_number, ''.join(chunk))
            chunk = []
            chunk_len = 0
        chunk.append(piece)
        chunk_len += len(piece)
    if chunk:
        part_number += 1
        _write_part(output_dir, base_name, ext, part_number, ''.join(chunk))
def process_directory(input_dir, output_dir):
    """Split every regular file found directly inside ``input_dir``.

    Both directories are created when missing. If the input directory had to
    be created there is nothing to split yet, so the function says so and
    returns.

    Args:
        input_dir: Folder containing the files to split.
        output_dir: Folder that receives the split parts.
    """
    os.makedirs(output_dir, exist_ok=True)
    if not os.path.exists(input_dir):
        os.makedirs(input_dir)
        print(f"Created '{input_dir}'. Put the files to split there and run again.")
        return
    # Sorted so files are processed in a predictable order on every platform.
    for filename in sorted(os.listdir(input_dir)):
        input_path = os.path.join(input_dir, filename)
        if os.path.isfile(input_path):
            split_file(input_path, output_dir)


if __name__ == "__main__":
    process_directory(INPUT_DIR, OUTPUT_DIR)

Script นี้ใช้ได้กับไฟล์ .txt, .csv, .md, และ .log และจะบันทึกไฟล์ที่ Split เป็นไฟล์ใหม่ โดยเติมท้ายชื่อว่า _Part1, _Part2, ฯลฯ เพื่อให้อัปโหลดเข้า NotebookLM ได้ง่าย

4. File Format Converter ไม่ต้องพึ่ง online converters อีกต่อไป

การต้องเปิดเว็บ อัปโหลด รอแปลง แล้วค่อยดาวน์โหลดเอกสารมาแปลงไฟล์ เพื่อทำให้เอกสารออกมาเป็น format ที่ใช้งานได้พอดี ๆ มันทำให้ workflow สะดุด และเพิ่มแรงเสียดทานแบบไม่จำเป็น
Script ด้านล่างช่วย convert ไฟล์ไป–กลับได้กับ format ที่ใช้บ่อย เช่น TXT, Markdown, DOCX, HTML, และแม้แต่ PDFs ทุกอย่างเกิดขึ้นบนคอมของคุณเอง และใช้เวลาแค่ไม่กี่วินาที

Code:

#!/usr/bin/env python3
import argparse, sys, re
from pathlib import Path
# Optional third-party converters. Each flag records whether the matching
# import succeeded, so the conversion functions can report a missing package.
try:
    from docx import Document
except ImportError:
    DOCX_AVAILABLE = False
else:
    DOCX_AVAILABLE = True

try:
    from markdownify import markdownify
except ImportError:
    MARKDOWNIFY_AVAILABLE = False
else:
    MARKDOWNIFY_AVAILABLE = True

try:
    import PyPDF2
except ImportError:
    PDF_AVAILABLE = False
else:
    PDF_AVAILABLE = True
def txt_to_md(text, title=None):
    """Convert plain text to Markdown using simple heading heuristics.

    Args:
        text: The plain-text input.
        title: Optional document title, emitted as a level-1 heading.

    Returns:
        The Markdown text. Short ALL-CAPS lines become level-2 headings and
        short lines ending in ':' become level-3 headings; every other line
        is kept with trailing whitespace removed.
    """
    result = []
    if title:
        result.append(f"# {title}\n")
    for raw_line in text.split('\n'):
        line = raw_line.rstrip()
        # NOTE(review): the published listing lost the text after "3" and
        # after "len(line)" on this line. The length bounds and heading
        # markup below are a reconstruction -- confirm against the upstream
        # script before relying on them.
        if line.isupper() and 3 < len(line) < 80:
            result.append(f"## {line.title()}")
        elif line.endswith(':') and 1 < len(line) < 80:
            result.append(f"### {line[:-1].rstrip()}")
        else:
            result.append(line)
    return '\n'.join(result)
def html_to_md(html):
    """Convert an HTML string to Markdown with ATX ('#') headings.

    ``script`` and ``style`` are passed to markdownify's ``strip`` option.
    Prints an error and exits with status 1 when markdownify is not installed.
    """
    if not MARKDOWNIFY_AVAILABLE:
        print("Error: markdownify not installed", file=sys.stderr)
        sys.exit(1)
    return markdownify(html, heading_style='ATX', strip=['script', 'style'])
def md_to_txt(markdown):
    """Strip common Markdown syntax and return plain text.

    Removes heading markers, emphasis markers, link/image syntax (keeping the
    link text or alt text), inline-code backticks, block-quote markers and
    horizontal rules. Fenced code blocks are removed together with their
    content. Runs of three or more newlines collapse to one blank line.
    """
    text = markdown
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
    # Emphasis content must start and end on a non-space character, so "* "
    # list bullets are not mistaken for an opening/closing pair.
    text = re.sub(r'\*\*(?=\S)([^*]+?)(?<=\S)\*\*', r'\1', text)
    text = re.sub(r'\*(?=\S)([^*\n]+?)(?<=\S)\*', r'\1', text)
    # Underscore emphasis must not sit inside a word, so identifiers such as
    # snake_case_name keep their underscores.
    text = re.sub(r'(?<!\w)__(?=\S)([^_]+?)(?<=\S)__(?!\w)', r'\1', text)
    text = re.sub(r'(?<!\w)_(?=\S)([^_\n]+?)(?<=\S)_(?!\w)', r'\1', text)
    # Images before links: the link pattern also matches the "[alt](url)"
    # part of an image and would otherwise leave a stray "!" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'```[\s\S]*?```', '', text)
    text = re.sub(r'`([^`]+)`', r'\1', text)
    text = re.sub(r'^>\s*', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
def _run_to_md(run):
    """Return a run's text wrapped in Markdown emphasis for bold/italic."""
    text = run.text
    if not text.strip():
        # Wrapping empty or whitespace-only text would emit stray markers
        # such as "****".
        return text
    if run.bold and run.italic:
        return f"***{text}***"
    if run.bold:
        return f"**{text}**"
    if run.italic:
        return f"*{text}*"
    return text


def docx_to_md(path):
    """Convert a .docx file to Markdown.

    Paragraph styles whose name contains "heading 1"/"title", "heading 2",
    "heading 3" or any other "heading" map to '#', '##', '###' and '####'.
    Other paragraphs are rebuilt run by run, keeping bold and italic.
    Prints an error and exits with status 1 when python-docx is missing.

    Returns:
        The Markdown text, paragraphs separated by blank lines.
    """
    if not DOCX_AVAILABLE:
        print("Error: python-docx not installed", file=sys.stderr)
        sys.exit(1)
    doc = Document(path)
    lines = []
    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            lines.append('')
            continue
        style = para.style.name.lower() if para.style else ''
        if 'heading 1' in style or 'title' in style:
            lines.append(f"# {text}")
        elif 'heading 2' in style:
            lines.append(f"## {text}")
        elif 'heading 3' in style:
            lines.append(f"### {text}")
        elif 'heading' in style:
            lines.append(f"#### {text}")
        else:
            lines.append(''.join(_run_to_md(run) for run in para.runs))
    return '\n\n'.join(lines)
def docx_to_txt(path):
    """Return the text of a .docx file, one blank line between paragraphs.

    Empty and whitespace-only paragraphs are skipped. Prints an error and
    exits with status 1 when python-docx is missing.
    """
    if not DOCX_AVAILABLE:
        print("Error: python-docx not installed", file=sys.stderr)
        sys.exit(1)
    doc = Document(path)
    return '\n\n'.join(p.text for p in doc.paragraphs if p.text.strip())
def md_to_docx(md, out):
    """Write Markdown text to a .docx file.

    '#' headings become Word headings, fenced code blocks become a single
    paragraph in the 'Quote' style, and other non-blank lines become
    paragraphs with bold/italic markers and link syntax stripped. Prints an
    error and exits with status 1 when python-docx is missing.

    Args:
        md: Markdown source text.
        out: Path of the .docx file to create.
    """
    if not DOCX_AVAILABLE:
        print("Error: python-docx not installed", file=sys.stderr)
        sys.exit(1)
    doc = Document()
    lines = md.split('\n')
    i = 0
    # NOTE(review): both `while` headers were truncated to "while i" in the
    # published listing; the bounds check and the closing-fence test are
    # reconstructed from how `i`, `lines` and `code` are used around them.
    while i < len(lines):
        line = lines[i]
        m = re.match(r'^(#{1,6})\s+(.+)$', line)
        if m:
            doc.add_heading(m.group(2), level=min(len(m.group(1)), 9))
            i += 1
            continue
        if line.startswith('```'):
            code = []
            i += 1
            # Collect everything up to the closing fence (or end of input).
            while i < len(lines) and not lines[i].startswith('```'):
                code.append(lines[i])
                i += 1
            if code:
                doc.add_paragraph('\n'.join(code)).style = 'Quote'
            i += 1  # step past the closing fence
            continue
        if line.strip():
            line = re.sub(r'\*\*([^*]+)\*\*', r'\1', line)
            line = re.sub(r'\*([^*]+)\*', r'\1', line)
            line = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', line)
            doc.add_paragraph(line)
        i += 1
    doc.save(out)
def pdf_to_txt(path):
    """Return the extracted text of a PDF, pages separated by blank lines.

    Pages for which no text can be extracted contribute an empty string.
    Prints an error and exits with status 1 when PyPDF2 is missing.
    """
    if not PDF_AVAILABLE:
        print("Error: PyPDF2 not installed", file=sys.stderr)
        sys.exit(1)
    # The context manager closes the file even if parsing a page raises.
    with open(path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        pages = [page.extract_text() or '' for page in reader.pages]
    return '\n\n'.join(pages).strip()
def pdf_to_md(path):
    """Extract a PDF's text and convert it to Markdown, titled with the file stem."""
    plain_text = pdf_to_txt(path)
    return txt_to_md(plain_text, Path(path).stem)
def get_format(path):
    """Map a file path to a format name based on its extension.

    The extension is matched case-insensitively. Unknown or missing
    extensions are treated as ``'txt'``.

    Returns:
        One of ``'txt'``, ``'md'``, ``'html'``, ``'docx'`` or ``'pdf'``.
    """
    ext = Path(path).suffix.lower()
    formats = {
        '.txt': 'txt',
        '.md': 'md',
        '.markdown': 'md',
        '.html': 'html',
        '.htm': 'html',
        '.docx': 'docx',
        '.pdf': 'pdf',
    }
    return formats.get(ext, 'txt')
def convert(path, to):
    """Convert the file at ``path`` to the format named by ``to``.

    Args:
        path: Input file; its format is taken from the extension.
        to: Target format name (``'txt'``, ``'md'``, ``'docx'``, ...).

    Returns:
        A ``(content, extension)`` tuple. For Markdown to DOCX the content is
        ``None``: the caller must write the file with ``md_to_docx``.

    Raises:
        ValueError: If the conversion is not supported, or if the input is
            already a binary file (docx/pdf) in the target format.
    """
    source = get_format(path)

    def read_text():
        # read_text closes the file; the bare open(...).read() calls did not.
        return Path(path).read_text(encoding='utf-8')

    if source == to:
        if source in ('docx', 'pdf'):
            # These are binary formats; returning them as decoded text would
            # fail or produce a corrupt output file.
            raise ValueError(f"{path} is already in {to} format")
        return read_text(), f'.{to}'
    if source == 'txt' and to == 'md':
        return txt_to_md(read_text(), Path(path).stem), '.md'
    if source == 'md' and to == 'txt':
        return md_to_txt(read_text()), '.txt'
    if source == 'html' and to == 'md':
        return html_to_md(read_text()), '.md'
    if source == 'html' and to == 'txt':
        return md_to_txt(html_to_md(read_text())), '.txt'
    if source == 'docx' and to == 'md':
        return docx_to_md(path), '.md'
    if source == 'docx' and to == 'txt':
        return docx_to_txt(path), '.txt'
    if source == 'md' and to == 'docx':
        return None, '.docx'
    if source == 'pdf' and to == 'txt':
        return pdf_to_txt(path), '.txt'
    if source == 'pdf' and to == 'md':
        return pdf_to_md(path), '.md'
    raise ValueError(f"Conversion from {source} to {to} not supported")
def _write_converted(source, content, target, to):
    """Write one conversion result to ``target``.

    ``content`` is ``None`` for Markdown to DOCX, in which case the source is
    re-read and written through ``md_to_docx``.
    """
    if to == 'docx' and content is None:
        md_to_docx(Path(source).read_text(encoding='utf-8'), str(target))
    else:
        with open(target, 'w', encoding='utf-8') as fh:
            fh.write(content)


def main():
    """Command-line entry point for the format converter.

    With ``--batch`` or more than one input file, results go into the
    ``--output`` directory (default ``converted``) and missing inputs are
    skipped with a warning. With a single input, the result goes to
    ``--output`` or to ``<input stem><new extension>`` in the current
    directory. Any error is printed to stderr and the process exits with
    status 1.
    """
    p = argparse.ArgumentParser(
        description='Convert formats for NotebookLM (supports PDFs)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument('input_files', nargs='+')
    p.add_argument('--to', required=True, choices=['txt', 'md', 'html', 'docx'])
    p.add_argument('-o', '--output')
    p.add_argument('--batch', action='store_true')
    a = p.parse_args()
    try:
        if a.batch or len(a.input_files) > 1:
            od = Path(a.output) if a.output else Path('converted')
            od.mkdir(parents=True, exist_ok=True)
            for f in a.input_files:
                ip = Path(f)
                if not ip.exists():
                    print(f"Warning: {f} not found, skipping", file=sys.stderr)
                    continue
                c, ext = convert(f, a.to)
                of = od / f"{ip.stem}{ext}"
                _write_converted(f, c, of, a.to)
                print(f"Converted: {f} -> {of}")
        else:
            f = a.input_files[0]
            c, ext = convert(f, a.to)
            of = a.output if a.output else Path(f).stem + ext
            _write_converted(f, c, of, a.to)
            print(f"Saved to {of}")
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()

มันยังรองรับ batch conversions (แปลงทีละหลายไฟล์พร้อมกัน) และสามารถสร้าง output folder ให้ได้ถ้าต้องการ

ข้อสรุป:

ใช้ Scripts 4 ตัว ช่วยดึงเว็บกับ YouTube ออกมาเป็นไฟล์อ่านง่าย แปลงและแบ่งไฟล์ให้พอดี เพื่ออัปโหลดเข้า NotebookLM ได้ลื่นขึ้น แล้วเอาเวลาไปโฟกัสการสรุปและต่อยอดไอเดียให้คมกว่าเดิม

Source:

XDA-Developers

4 Python Scripts เปลี่ยน NotebookLM เป็นเครื่องสรุปข้อมูลระดับโปร เร็วขึ้น 10 เท่า

1. Web Scraper ดึงคอนเทนต์จากเว็บไหนก็ได้แบบไว ๆ

2. YouTube Transcript Scraper แปลง VDO เป็น Text ในไม่กี่วินาที

3. File Splitter จากไฟล์ใหญ่ → ชิ้นเล็ก ๆ ที่จัดการง่าย

4. File Format Converter ไม่ต้องพึ่ง online converters อีกต่อไป

ข้อสรุป:

Source:

Related Contents

4 ฉากทัศน์เศรษฐกิจโลกปี 2030 จาก WEF เลือกผิด ส่งผลถึงธุรกิจสะดุดทั้งระบบ + วิธีรับมือ

5 เทรนด์ AI Agents ปี 2026 จาก Google ทำให้คนเก่งขึ้นด้านเพิ่มโปรดักทีฟ, บริการไว, ปลอดภัยขึ้น & Teamwork มากขึ้น

4 วิธีทำ Agentic AI ให้เก่งขึ้น-นิ่งขึ้นแบบเลือกใช้ได้ทันที ลดพัง เพิ่มความเสถียรมากขึ้น

3 สูตรทำให้ AI ใช้ไฟคุ้มจาก WEF ด้วยสูตร Net-Positive AI ใน 5 นาที